diff options
-rw-r--r-- | Doc/library/tarfile.rst | 3 | ||||
-rw-r--r-- | Lib/tarfile.py | 231 | ||||
-rw-r--r-- | Lib/test/test_tarfile.py | 70 | ||||
-rw-r--r-- | Lib/test/testtar.tar | bin | 298496 -> 427008 bytes | |||
-rw-r--r-- | Misc/NEWS | 3 |
5 files changed, 180 insertions, 127 deletions
diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 853406c219..0dfb065252 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -20,7 +20,8 @@ Some facts and figures: * read/write support for the POSIX.1-1988 (ustar) format. * read/write support for the GNU tar format including *longname* and *longlink* - extensions, read-only support for the *sparse* extension. + extensions, read-only support for all variants of the *sparse* extension + including restoration of sparse files. * read/write support for the POSIX.1-2001 (pax) format. diff --git a/Lib/tarfile.py b/Lib/tarfile.py index cc7514d0a6..e33b982081 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -701,13 +701,29 @@ class _FileInFile(object): object. """ - def __init__(self, fileobj, offset, size, sparse=None): + def __init__(self, fileobj, offset, size, blockinfo=None): self.fileobj = fileobj self.offset = offset self.size = size - self.sparse = sparse self.position = 0 + if blockinfo is None: + blockinfo = [(0, size)] + + # Construct a map with data and zero blocks. + self.map_index = 0 + self.map = [] + lastpos = 0 + realpos = self.offset + for offset, size in blockinfo: + if offset > lastpos: + self.map.append((False, lastpos, offset, None)) + self.map.append((True, offset, offset + size, realpos)) + realpos += size + lastpos = offset + size + if lastpos < self.size: + self.map.append((False, lastpos, self.size, None)) + def seekable(self): if not hasattr(self.fileobj, "seekable"): # XXX gzip.GzipFile and bz2.BZ2File @@ -732,48 +748,26 @@ class _FileInFile(object): else: size = min(size, self.size - self.position) - if self.sparse is None: - return self.readnormal(size) - else: - return self.readsparse(size) - - def readnormal(self, size): - """Read operation for regular files. - """ - self.fileobj.seek(self.offset + self.position) - self.position += size - return self.fileobj.read(size) - - def readsparse(self, size): - """Read operation for sparse files. - """ - data = b"" + buf = b"" while size > 0: - buf = self.readsparsesection(size) - if not buf: - break - size -= len(buf) - data += buf - return data - - def readsparsesection(self, size): - """Read a single section of a sparse file. - """ - section = self.sparse.find(self.position) - - if section is None: - return b"" - - size = min(size, section.offset + section.size - self.position) - - if isinstance(section, _data): - realpos = section.realpos + self.position - section.offset - self.fileobj.seek(self.offset + realpos) - self.position += size - return self.fileobj.read(size) - else: - self.position += size - return NUL * size + while True: + data, start, stop, offset = self.map[self.map_index] + if start <= self.position < stop: + break + else: + self.map_index += 1 + if self.map_index == len(self.map): + self.map_index = 0 + length = min(size, stop - self.position) + if data: + self.fileobj.seek(offset) + block = self.fileobj.read(stop - start) + buf += block[self.position - start:self.position + length] + else: + buf += NUL * length + size -= length + self.position += length + return buf #class _FileInFile @@ -1367,28 +1361,15 @@ class TarInfo(object): numbytes = nti(buf[pos + 12:pos + 24]) except ValueError: break - structs.append((offset, numbytes)) + if offset and numbytes: + structs.append((offset, numbytes)) pos += 24 isextended = bool(buf[504]) - - # Transform the sparse structures to something we can use - # in ExFileObject. - self.sparse = _ringbuffer() - lastpos = 0 - realpos = 0 - for offset, numbytes in structs: - if offset > lastpos: - self.sparse.append(_hole(lastpos, offset - lastpos)) - self.sparse.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - if lastpos < origsize: - self.sparse.append(_hole(lastpos, origsize - lastpos)) + self.sparse = structs self.offset_data = tarfile.fileobj.tell() tarfile.offset = self.offset_data + self._block(self.size) self.size = origsize - return self def _proc_pax(self, tarfile): @@ -1464,6 +1445,19 @@ class TarInfo(object): except HeaderError: raise SubsequentHeaderError("missing or bad subsequent header") + # Process GNU sparse information. + if "GNU.sparse.map" in pax_headers: + # GNU extended sparse format version 0.1. + self._proc_gnusparse_01(next, pax_headers) + + elif "GNU.sparse.size" in pax_headers: + # GNU extended sparse format version 0.0. + self._proc_gnusparse_00(next, pax_headers, buf) + + elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": + # GNU extended sparse format version 1.0. + self._proc_gnusparse_10(next, pax_headers, tarfile) + if self.type in (XHDTYPE, SOLARIS_XHDTYPE): # Patch the TarInfo object with the extended header info. next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) @@ -1480,24 +1474,59 @@ class TarInfo(object): return next + def _proc_gnusparse_00(self, next, pax_headers, buf): + """Process a GNU tar extended sparse header, version 0.0. + """ + offsets = [] + for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): + offsets.append(int(match.group(1))) + numbytes = [] + for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): + numbytes.append(int(match.group(1))) + next.sparse = list(zip(offsets, numbytes)) + + def _proc_gnusparse_01(self, next, pax_headers): + """Process a GNU tar extended sparse header, version 0.1. + """ + sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] + next.sparse = list(zip(sparse[::2], sparse[1::2])) + + def _proc_gnusparse_10(self, next, pax_headers, tarfile): + """Process a GNU tar extended sparse header, version 1.0. + """ + fields = None + sparse = [] + buf = tarfile.fileobj.read(BLOCKSIZE) + fields, buf = buf.split(b"\n", 1) + fields = int(fields) + while len(sparse) < fields * 2: + if b"\n" not in buf: + buf += tarfile.fileobj.read(BLOCKSIZE) + number, buf = buf.split(b"\n", 1) + sparse.append(int(number)) + next.offset_data = tarfile.fileobj.tell() + next.sparse = list(zip(sparse[::2], sparse[1::2])) + def _apply_pax_info(self, pax_headers, encoding, errors): """Replace fields with supplemental information from a previous pax extended or global header. """ for keyword, value in pax_headers.items(): - if keyword not in PAX_FIELDS: - continue - - if keyword == "path": - value = value.rstrip("/") - - if keyword in PAX_NUMBER_FIELDS: - try: - value = PAX_NUMBER_FIELDS[keyword](value) - except ValueError: - value = 0 - - setattr(self, keyword, value) + if keyword == "GNU.sparse.name": + setattr(self, "path", value) + elif keyword == "GNU.sparse.size": + setattr(self, "size", int(value)) + elif keyword == "GNU.sparse.realsize": + setattr(self, "size", int(value)) + elif keyword in PAX_FIELDS: + if keyword in PAX_NUMBER_FIELDS: + try: + value = PAX_NUMBER_FIELDS[keyword](value) + except ValueError: + value = 0 + if keyword == "path": + value = value.rstrip("/") + setattr(self, keyword, value) self.pax_headers = pax_headers.copy() @@ -1535,7 +1564,7 @@ class TarInfo(object): def isfifo(self): return self.type == FIFOTYPE def issparse(self): - return self.type == GNUTYPE_SPARSE + return self.sparse is not None def isdev(self): return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) # class TarInfo @@ -2255,10 +2284,17 @@ class TarFile(object): def makefile(self, tarinfo, targetpath): """Make a file called targetpath. """ - source = self.extractfile(tarinfo) + source = self.fileobj + source.seek(tarinfo.offset_data) target = bltn_open(targetpath, "wb") - copyfileobj(source, target) - source.close() + if tarinfo.sparse is not None: + for offset, size in tarinfo.sparse: + target.seek(offset) + copyfileobj(source, target, size) + else: + copyfileobj(source, target, tarinfo.size) + target.seek(tarinfo.size) + target.truncate() target.close() def makeunknown(self, tarinfo, targetpath): @@ -2544,49 +2580,6 @@ class TarIter: self.index += 1 return tarinfo -# Helper classes for sparse file support -class _section: - """Base class for _data and _hole. - """ - def __init__(self, offset, size): - self.offset = offset - self.size = size - def __contains__(self, offset): - return self.offset <= offset < self.offset + self.size - -class _data(_section): - """Represent a data section in a sparse file. - """ - def __init__(self, offset, size, realpos): - _section.__init__(self, offset, size) - self.realpos = realpos - -class _hole(_section): - """Represent a hole section in a sparse file. - """ - pass - -class _ringbuffer(list): - """Ringbuffer class which increases performance - over a regular list. - """ - def __init__(self): - self.idx = 0 - def find(self, offset): - idx = self.idx - while True: - item = self[idx] - if offset in item: - break - idx += 1 - if idx == len(self): - idx = 0 - if idx == self.idx: - # End of File - return None - self.idx = idx - return item - #-------------------- # exported functions #-------------------- diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 3a217dc815..8dc3ff9aa2 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -526,6 +526,22 @@ class MemberReadTest(ReadTest): tarinfo = self.tar.getmember("ustar/sparse") self._test_member(tarinfo, size=86016, chksum=md5_sparse) + def test_find_gnusparse(self): + tarinfo = self.tar.getmember("gnu/sparse") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_00(self): + tarinfo = self.tar.getmember("gnu/sparse-0.0") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_01(self): + tarinfo = self.tar.getmember("gnu/sparse-0.1") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + + def test_find_gnusparse_10(self): + tarinfo = self.tar.getmember("gnu/sparse-1.0") + self._test_member(tarinfo, size=86016, chksum=md5_sparse) + def test_find_umlauts(self): tarinfo = self.tar.getmember("ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf") self._test_member(tarinfo, size=7011, chksum=md5_regtype) @@ -589,13 +605,53 @@ class GNUReadTest(LongnameTest): subdir = "gnu" longnametype = tarfile.GNUTYPE_LONGNAME - def test_sparse_file(self): - tarinfo1 = self.tar.getmember("ustar/sparse") - fobj1 = self.tar.extractfile(tarinfo1) - tarinfo2 = self.tar.getmember("gnu/sparse") - fobj2 = self.tar.extractfile(tarinfo2) - self.assertEqual(fobj1.read(), fobj2.read(), - "sparse file extraction failed") + # Since 3.2 tarfile is supposed to accurately restore sparse members and + # produce files with holes. This is what we actually want to test here. + # Unfortunately, not all platforms/filesystems support sparse files, and + # even on platforms that do it is non-trivial to make reliable assertions + # about holes in files. Therefore, we first do one basic test which works + # an all platforms, and after that a test that will work only on + # platforms/filesystems that prove to support sparse files. + def _test_sparse_file(self, name): + self.tar.extract(name, TEMPDIR) + filename = os.path.join(TEMPDIR, name) + with open(filename, "rb") as fobj: + data = fobj.read() + self.assertEqual(md5sum(data), md5_sparse, + "wrong md5sum for %s" % name) + + if self._fs_supports_holes(): + s = os.stat(filename) + self.assertTrue(s.st_blocks * 512 < s.st_size) + + def test_sparse_file_old(self): + self._test_sparse_file("gnu/sparse") + + def test_sparse_file_00(self): + self._test_sparse_file("gnu/sparse-0.0") + + def test_sparse_file_01(self): + self._test_sparse_file("gnu/sparse-0.1") + + def test_sparse_file_10(self): + self._test_sparse_file("gnu/sparse-1.0") + + @staticmethod + def _fs_supports_holes(): + # Return True if the platform knows the st_blocks stat attribute and + # uses st_blocks units of 512 bytes, and if the filesystem is able to + # store holes in files. + if sys.platform == "linux2": + # Linux evidentially has 512 byte st_blocks units. + name = os.path.join(TEMPDIR, "sparse-test") + with open(name, "wb") as fobj: + fobj.seek(4096) + fobj.truncate() + s = os.stat(name) + os.remove(name) + return s.st_blocks == 0 + else: + return False class PaxReadTest(LongnameTest): diff --git a/Lib/test/testtar.tar b/Lib/test/testtar.tar Binary files differindex dc1942c19d..b93210453d 100644 --- a/Lib/test/testtar.tar +++ b/Lib/test/testtar.tar @@ -54,6 +54,9 @@ Core and Builtins Library ------- +- tarfile.py: Add support for all missing variants of the GNU sparse + extensions and create files with holes when extracting sparse members. + - Issue #10218: Return timeout status from ``Condition.wait`` in threading. - Issue #7351: Add ``zipfile.BadZipFile`` spelling of the exception name |