summaryrefslogtreecommitdiff
path: root/Lib/tarfile.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/tarfile.py')
-rw-r--r--Lib/tarfile.py680
1 files changed, 410 insertions, 270 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index beb4135724..0f9d1dade1 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
@@ -50,18 +50,20 @@ import struct
import copy
import re
-if sys.platform == 'mac':
- # This module needs work for MacOS9, especially in the area of pathname
- # handling. In many places it is assumed a simple substitution of / by the
- # local os.path.sep is good enough to convert pathnames, but this does not
- # work with the mac rooted:path:name versus :nonrooted:path:name syntax
- raise ImportError("tarfile does not work for platform==mac")
-
try:
import grp, pwd
except ImportError:
grp = pwd = None
+# os.symlink on Windows prior to 6.0 raises NotImplementedError
+symlink_exception = (AttributeError, NotImplementedError)
+try:
+ # WindowsError (1314) will be raised if the caller does not hold the
+ # SeCreateSymbolicLinkPrivilege privilege
+ symlink_exception += (WindowsError,)
+except NameError:
+ pass
+
# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
@@ -125,6 +127,9 @@ GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
"uid", "gid", "uname", "gname")
+# Fields from a pax header that are affected by hdrcharset.
+PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
+
# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
@@ -163,9 +168,10 @@ TOEXEC = 0o001 # execute/search by other
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
-ENCODING = sys.getfilesystemencoding()
-if ENCODING is None:
- ENCODING = "ascii"
+if os.name in ("nt", "ce"):
+ ENCODING = "utf-8"
+else:
+ ENCODING = sys.getfilesystemencoding()
#---------------------------------------------------------
# Some useful functions
@@ -194,7 +200,7 @@ def nti(s):
try:
n = int(nts(s, "ascii", "strict") or "0", 8)
except ValueError:
- raise HeaderError("invalid header")
+ raise InvalidHeaderError("invalid header")
else:
n = 0
for i in range(len(s) - 1):
@@ -309,11 +315,6 @@ def filemode(mode):
perm.append("-")
return "".join(perm)
-if os.sep != "/":
- normpath = lambda path: os.path.normpath(path).replace(os.sep, "/")
-else:
- normpath = os.path.normpath
-
class TarError(Exception):
"""Base exception."""
pass
@@ -330,8 +331,23 @@ class StreamError(TarError):
"""Exception for unsupported operations on stream-like TarFiles."""
pass
class HeaderError(TarError):
+ """Base exception for header errors."""
+ pass
+class EmptyHeaderError(HeaderError):
+ """Exception for empty headers."""
+ pass
+class TruncatedHeaderError(HeaderError):
+ """Exception for truncated headers."""
+ pass
+class EOFHeaderError(HeaderError):
+ """Exception for end of file headers."""
+ pass
+class InvalidHeaderError(HeaderError):
"""Exception for invalid headers."""
pass
+class SubsequentHeaderError(HeaderError):
+ """Exception for missing and invalid extended headers."""
+ pass
#---------------------------
# internal stream interface
@@ -394,28 +410,34 @@ class _Stream:
self.pos = 0
self.closed = False
- if comptype == "gz":
- try:
- import zlib
- except ImportError:
- raise CompressionError("zlib module is not available")
- self.zlib = zlib
- self.crc = zlib.crc32(b"")
- if mode == "r":
- self._init_read_gz()
- else:
- self._init_write_gz()
+ try:
+ if comptype == "gz":
+ try:
+ import zlib
+ except ImportError:
+ raise CompressionError("zlib module is not available")
+ self.zlib = zlib
+ self.crc = zlib.crc32(b"")
+ if mode == "r":
+ self._init_read_gz()
+ else:
+ self._init_write_gz()
- if comptype == "bz2":
- try:
- import bz2
- except ImportError:
- raise CompressionError("bz2 module is not available")
- if mode == "r":
- self.dbuf = b""
- self.cmp = bz2.BZ2Decompressor()
- else:
- self.cmp = bz2.BZ2Compressor()
+ if comptype == "bz2":
+ try:
+ import bz2
+ except ImportError:
+ raise CompressionError("bz2 module is not available")
+ if mode == "r":
+ self.dbuf = b""
+ self.cmp = bz2.BZ2Decompressor()
+ else:
+ self.cmp = bz2.BZ2Compressor()
+ except:
+ if not self._extfileobj:
+ self.fileobj.close()
+ self.closed = True
+ raise
def __del__(self):
if hasattr(self, "closed") and not self.closed:
@@ -679,13 +701,29 @@ class _FileInFile(object):
object.
"""
- def __init__(self, fileobj, offset, size, sparse=None):
+ def __init__(self, fileobj, offset, size, blockinfo=None):
self.fileobj = fileobj
self.offset = offset
self.size = size
- self.sparse = sparse
self.position = 0
+ if blockinfo is None:
+ blockinfo = [(0, size)]
+
+ # Construct a map with data and zero blocks.
+ self.map_index = 0
+ self.map = []
+ lastpos = 0
+ realpos = self.offset
+ for offset, size in blockinfo:
+ if offset > lastpos:
+ self.map.append((False, lastpos, offset, None))
+ self.map.append((True, offset, offset + size, realpos))
+ realpos += size
+ lastpos = offset + size
+ if lastpos < self.size:
+ self.map.append((False, lastpos, self.size, None))
+
def seekable(self):
if not hasattr(self.fileobj, "seekable"):
# XXX gzip.GzipFile and bz2.BZ2File
@@ -710,48 +748,25 @@ class _FileInFile(object):
else:
size = min(size, self.size - self.position)
- if self.sparse is None:
- return self.readnormal(size)
- else:
- return self.readsparse(size)
-
- def readnormal(self, size):
- """Read operation for regular files.
- """
- self.fileobj.seek(self.offset + self.position)
- self.position += size
- return self.fileobj.read(size)
-
- def readsparse(self, size):
- """Read operation for sparse files.
- """
- data = b""
+ buf = b""
while size > 0:
- buf = self.readsparsesection(size)
- if not buf:
- break
- size -= len(buf)
- data += buf
- return data
-
- def readsparsesection(self, size):
- """Read a single section of a sparse file.
- """
- section = self.sparse.find(self.position)
-
- if section is None:
- return b""
-
- size = min(size, section.offset + section.size - self.position)
-
- if isinstance(section, _data):
- realpos = section.realpos + self.position - section.offset
- self.fileobj.seek(self.offset + realpos)
- self.position += size
- return self.fileobj.read(size)
- else:
- self.position += size
- return NUL * size
+ while True:
+ data, start, stop, offset = self.map[self.map_index]
+ if start <= self.position < stop:
+ break
+ else:
+ self.map_index += 1
+ if self.map_index == len(self.map):
+ self.map_index = 0
+ length = min(size, stop - self.position)
+ if data:
+ self.fileobj.seek(offset + (self.position - start))
+ buf += self.fileobj.read(length)
+ else:
+ buf += NUL * length
+ size -= length
+ self.position += length
+ return buf
#class _FileInFile
@@ -955,7 +970,7 @@ class TarInfo(object):
"""Return the TarInfo's attributes as a dictionary.
"""
info = {
- "name": normpath(self.name),
+ "name": self.name,
"mode": self.mode & 0o7777,
"uid": self.uid,
"gid": self.gid,
@@ -963,7 +978,7 @@ class TarInfo(object):
"mtime": self.mtime,
"chksum": self.chksum,
"type": self.type,
- "linkname": normpath(self.linkname) if self.linkname else "",
+ "linkname": self.linkname,
"uname": self.uname,
"gname": self.gname,
"devmajor": self.devmajor,
@@ -975,7 +990,7 @@ class TarInfo(object):
return info
- def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
+ def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
"""Return a tar header as a string of 512 byte blocks.
"""
info = self.get_info()
@@ -985,7 +1000,7 @@ class TarInfo(object):
elif format == GNU_FORMAT:
return self.create_gnu_header(info, encoding, errors)
elif format == PAX_FORMAT:
- return self.create_pax_header(info)
+ return self.create_pax_header(info, encoding)
else:
raise ValueError("invalid format")
@@ -1016,7 +1031,7 @@ class TarInfo(object):
return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
- def create_pax_header(self, info):
+ def create_pax_header(self, info, encoding):
"""Return the object as a ustar header block. If it cannot be
represented this way, prepend a pax extended header sequence
with supplement information.
@@ -1059,7 +1074,7 @@ class TarInfo(object):
# Create a pax extended header if necessary.
if pax_headers:
- buf = self._create_pax_generic_header(pax_headers, XHDTYPE)
+ buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
else:
buf = b""
@@ -1069,7 +1084,7 @@ class TarInfo(object):
def create_pax_global_header(cls, pax_headers):
"""Return the object as a pax global header block sequence.
"""
- return cls._create_pax_generic_header(pax_headers, XGLTYPE)
+ return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")
def _posix_split_name(self, name):
"""Split a name longer than 100 chars into a prefix
@@ -1142,15 +1157,35 @@ class TarInfo(object):
cls._create_payload(name)
@classmethod
- def _create_pax_generic_header(cls, pax_headers, type):
- """Return a POSIX.1-2001 extended or global header sequence
+ def _create_pax_generic_header(cls, pax_headers, type, encoding):
+ """Return a POSIX.1-2008 extended or global header sequence
that contains a list of keyword, value pairs. The values
must be strings.
"""
+ # Check if one of the fields contains surrogate characters and thereby
+ # forces hdrcharset=BINARY, see _proc_pax() for more information.
+ binary = False
+ for keyword, value in pax_headers.items():
+ try:
+ value.encode("utf8", "strict")
+ except UnicodeEncodeError:
+ binary = True
+ break
+
records = b""
+ if binary:
+ # Put the hdrcharset field at the beginning of the header.
+ records += b"21 hdrcharset=BINARY\n"
+
for keyword, value in pax_headers.items():
keyword = keyword.encode("utf8")
- value = value.encode("utf8")
+ if binary:
+ # Try to restore the original byte representation of `value'.
+ # Needless to say, that the encoding must match the string.
+ value = value.encode(encoding, "surrogateescape")
+ else:
+ value = value.encode("utf8")
+
l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
n = p = 0
while True:
@@ -1176,14 +1211,16 @@ class TarInfo(object):
def frombuf(cls, buf, encoding, errors):
"""Construct a TarInfo object from a 512 byte bytes object.
"""
+ if len(buf) == 0:
+ raise EmptyHeaderError("empty header")
if len(buf) != BLOCKSIZE:
- raise HeaderError("truncated header")
+ raise TruncatedHeaderError("truncated header")
if buf.count(NUL) == BLOCKSIZE:
- raise HeaderError("empty header")
+ raise EOFHeaderError("end of file header")
chksum = nti(buf[148:156])
if chksum not in calc_chksums(buf):
- raise HeaderError("bad checksum")
+ raise InvalidHeaderError("bad checksum")
obj = cls()
obj.name = nts(buf[0:100], encoding, errors)
@@ -1239,8 +1276,6 @@ class TarInfo(object):
tarfile.
"""
buf = tarfile.fileobj.read(BLOCKSIZE)
- if not buf:
- return
obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
return obj._proc_member(tarfile)
@@ -1293,9 +1328,10 @@ class TarInfo(object):
buf = tarfile.fileobj.read(self._block(self.size))
# Fetch the next header and process it.
- next = self.fromtarfile(tarfile)
- if next is None:
- raise HeaderError("missing subsequent header")
+ try:
+ next = self.fromtarfile(tarfile)
+ except HeaderError:
+ raise SubsequentHeaderError("missing or bad subsequent header")
# Patch the TarInfo object from the next header with
# the longname information.
@@ -1324,33 +1360,20 @@ class TarInfo(object):
numbytes = nti(buf[pos + 12:pos + 24])
except ValueError:
break
- structs.append((offset, numbytes))
+ if offset and numbytes:
+ structs.append((offset, numbytes))
pos += 24
isextended = bool(buf[504])
-
- # Transform the sparse structures to something we can use
- # in ExFileObject.
- self.sparse = _ringbuffer()
- lastpos = 0
- realpos = 0
- for offset, numbytes in structs:
- if offset > lastpos:
- self.sparse.append(_hole(lastpos, offset - lastpos))
- self.sparse.append(_data(offset, numbytes, realpos))
- realpos += numbytes
- lastpos = offset + numbytes
- if lastpos < origsize:
- self.sparse.append(_hole(lastpos, origsize - lastpos))
+ self.sparse = structs
self.offset_data = tarfile.fileobj.tell()
tarfile.offset = self.offset_data + self._block(self.size)
self.size = origsize
-
return self
def _proc_pax(self, tarfile):
"""Process an extended or global header as described in
- POSIX.1-2001.
+ POSIX.1-2008.
"""
# Read the header information.
buf = tarfile.fileobj.read(self._block(self.size))
@@ -1363,6 +1386,24 @@ class TarInfo(object):
else:
pax_headers = tarfile.pax_headers.copy()
+ # Check if the pax header contains a hdrcharset field. This tells us
+ # the encoding of the path, linkpath, uname and gname fields. Normally,
+ # these fields are UTF-8 encoded but since POSIX.1-2008 tar
+ # implementations are allowed to store them as raw binary strings if
+ # the translation to UTF-8 fails.
+ match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
+ if match is not None:
+ pax_headers["hdrcharset"] = match.group(1).decode("utf8")
+
+ # For the time being, we don't care about anything other than "BINARY".
+ # The only other value that is currently allowed by the standard is
+ # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
+ hdrcharset = pax_headers.get("hdrcharset")
+ if hdrcharset == "BINARY":
+ encoding = tarfile.encoding
+ else:
+ encoding = "utf8"
+
# Parse pax header information. A record looks like that:
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
@@ -1378,19 +1419,45 @@ class TarInfo(object):
length = int(length)
value = buf[match.end(2) + 1:match.start(1) + length - 1]
- keyword = keyword.decode("utf8")
- value = value.decode("utf8")
+ # Normally, we could just use "utf8" as the encoding and "strict"
+ # as the error handler, but we better not take the risk. For
+ # example, GNU tar <= 1.23 is known to store filenames it cannot
+ # translate to UTF-8 as raw strings (unfortunately without a
+ # hdrcharset=BINARY header).
+ # We first try the strict standard encoding, and if that fails we
+ # fall back on the user's encoding and error handler.
+ keyword = self._decode_pax_field(keyword, "utf8", "utf8",
+ tarfile.errors)
+ if keyword in PAX_NAME_FIELDS:
+ value = self._decode_pax_field(value, encoding, tarfile.encoding,
+ tarfile.errors)
+ else:
+ value = self._decode_pax_field(value, "utf8", "utf8",
+ tarfile.errors)
pax_headers[keyword] = value
pos += length
# Fetch the next header.
- next = self.fromtarfile(tarfile)
+ try:
+ next = self.fromtarfile(tarfile)
+ except HeaderError:
+ raise SubsequentHeaderError("missing or bad subsequent header")
- if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
- if next is None:
- raise HeaderError("missing subsequent header")
+ # Process GNU sparse information.
+ if "GNU.sparse.map" in pax_headers:
+ # GNU extended sparse format version 0.1.
+ self._proc_gnusparse_01(next, pax_headers)
+ elif "GNU.sparse.size" in pax_headers:
+ # GNU extended sparse format version 0.0.
+ self._proc_gnusparse_00(next, pax_headers, buf)
+
+ elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
+ # GNU extended sparse format version 1.0.
+ self._proc_gnusparse_10(next, pax_headers, tarfile)
+
+ if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
# Patch the TarInfo object with the extended header info.
next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
next.offset = self.offset
@@ -1406,27 +1473,70 @@ class TarInfo(object):
return next
+ def _proc_gnusparse_00(self, next, pax_headers, buf):
+ """Process a GNU tar extended sparse header, version 0.0.
+ """
+ offsets = []
+ for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
+ offsets.append(int(match.group(1)))
+ numbytes = []
+ for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
+ numbytes.append(int(match.group(1)))
+ next.sparse = list(zip(offsets, numbytes))
+
+ def _proc_gnusparse_01(self, next, pax_headers):
+ """Process a GNU tar extended sparse header, version 0.1.
+ """
+ sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
+ next.sparse = list(zip(sparse[::2], sparse[1::2]))
+
+ def _proc_gnusparse_10(self, next, pax_headers, tarfile):
+ """Process a GNU tar extended sparse header, version 1.0.
+ """
+ fields = None
+ sparse = []
+ buf = tarfile.fileobj.read(BLOCKSIZE)
+ fields, buf = buf.split(b"\n", 1)
+ fields = int(fields)
+ while len(sparse) < fields * 2:
+ if b"\n" not in buf:
+ buf += tarfile.fileobj.read(BLOCKSIZE)
+ number, buf = buf.split(b"\n", 1)
+ sparse.append(int(number))
+ next.offset_data = tarfile.fileobj.tell()
+ next.sparse = list(zip(sparse[::2], sparse[1::2]))
+
def _apply_pax_info(self, pax_headers, encoding, errors):
"""Replace fields with supplemental information from a previous
pax extended or global header.
"""
for keyword, value in pax_headers.items():
- if keyword not in PAX_FIELDS:
- continue
-
- if keyword == "path":
- value = value.rstrip("/")
-
- if keyword in PAX_NUMBER_FIELDS:
- try:
- value = PAX_NUMBER_FIELDS[keyword](value)
- except ValueError:
- value = 0
-
- setattr(self, keyword, value)
+ if keyword == "GNU.sparse.name":
+ setattr(self, "path", value)
+ elif keyword == "GNU.sparse.size":
+ setattr(self, "size", int(value))
+ elif keyword == "GNU.sparse.realsize":
+ setattr(self, "size", int(value))
+ elif keyword in PAX_FIELDS:
+ if keyword in PAX_NUMBER_FIELDS:
+ try:
+ value = PAX_NUMBER_FIELDS[keyword](value)
+ except ValueError:
+ value = 0
+ if keyword == "path":
+ value = value.rstrip("/")
+ setattr(self, keyword, value)
self.pax_headers = pax_headers.copy()
+ def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
+ """Decode a single field from a pax record.
+ """
+ try:
+ return value.decode(encoding, "strict")
+ except UnicodeDecodeError:
+ return value.decode(fallback_encoding, fallback_errors)
+
def _block(self, count):
"""Round up a byte count by BLOCKSIZE and return it,
e.g. _block(834) => 1024.
@@ -1453,7 +1563,7 @@ class TarInfo(object):
def isfifo(self):
return self.type == FIFOTYPE
def issparse(self):
- return self.type == GNUTYPE_SPARSE
+ return self.sparse is not None
def isdev(self):
return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo
@@ -1470,7 +1580,7 @@ class TarFile(object):
ignore_zeros = False # If true, skips empty or invalid blocks and
# continues processing.
- errorlevel = 0 # If 0, fatal errors only appear in debug
+ errorlevel = 1 # If 0, fatal errors only appear in debug
# messages (if debug >= 0). If > 0, errors
# are passed to the caller as exceptions.
@@ -1486,7 +1596,7 @@ class TarFile(object):
def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
- errors=None, pax_headers=None, debug=None, errorlevel=None):
+ errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
read from an existing archive, 'a' to append data to an existing
file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1527,13 +1637,7 @@ class TarFile(object):
self.ignore_zeros = ignore_zeros
if encoding is not None:
self.encoding = encoding
-
- if errors is not None:
- self.errors = errors
- elif mode == "r":
- self.errors = "replace"
- else:
- self.errors = "strict"
+ self.errors = errors
if pax_headers is not None and self.format == PAX_FORMAT:
self.pax_headers = pax_headers
@@ -1562,12 +1666,16 @@ class TarFile(object):
if self.mode == "a":
# Move to the end of the archive,
# before the first empty block.
- self.firstmember = None
while True:
- if self.next() is None:
- if self.offset > 0:
- self.fileobj.seek(self.fileobj.tell() - BLOCKSIZE)
+ self.fileobj.seek(self.offset)
+ try:
+ tarinfo = self.tarinfo.fromtarfile(self)
+ self.members.append(tarinfo)
+ except EOFHeaderError:
+ self.fileobj.seek(self.offset)
break
+ except HeaderError as e:
+ raise ReadError(str(e))
if self.mode in "aw":
self._loaded = True
@@ -1655,9 +1763,12 @@ class TarFile(object):
if filemode not in "rw":
raise ValueError("mode must be 'r' or 'w'")
- t = cls(name, filemode,
- _Stream(name, filemode, comptype, fileobj, bufsize),
- **kwargs)
+ stream = _Stream(name, filemode, comptype, fileobj, bufsize)
+ try:
+ t = cls(name, filemode, stream, **kwargs)
+ except:
+ stream.close()
+ raise
t._extfileobj = False
return t
@@ -1688,16 +1799,19 @@ class TarFile(object):
except (ImportError, AttributeError):
raise CompressionError("gzip module is not available")
- if fileobj is None:
- fileobj = bltn_open(name, mode + "b")
-
+ extfileobj = fileobj is not None
try:
- t = cls.taropen(name, mode,
- gzip.GzipFile(name, mode, compresslevel, fileobj),
- **kwargs)
+ fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
+ t = cls.taropen(name, mode, fileobj, **kwargs)
except IOError:
+ if not extfileobj:
+ fileobj.close()
raise ReadError("not a gzip file")
- t._extfileobj = False
+ except:
+ if not extfileobj:
+ fileobj.close()
+ raise
+ t._extfileobj = extfileobj
return t
@classmethod
@@ -1720,7 +1834,8 @@ class TarFile(object):
try:
t = cls.taropen(name, mode, fileobj, **kwargs)
- except IOError:
+ except (IOError, EOFError):
+ fileobj.close()
raise ReadError("not a bzip2 file")
t._extfileobj = False
return t
@@ -1801,10 +1916,9 @@ class TarFile(object):
# Absolute paths are turned to relative paths.
if arcname is None:
arcname = name
- arcname = normpath(arcname)
drv, arcname = os.path.splitdrive(arcname)
- while arcname[0:1] == "/":
- arcname = arcname[1:]
+ arcname = arcname.replace(os.sep, "/")
+ arcname = arcname.lstrip("/")
# Now, fill the TarInfo object with
# information specific for the file.
@@ -1910,13 +2024,16 @@ class TarFile(object):
print("link to", tarinfo.linkname, end=' ')
print()
- def add(self, name, arcname=None, recursive=True, exclude=None):
+ def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
"""Add the file `name' to the archive. `name' may be any type of file
(directory, fifo, symbolic link, etc.). If given, `arcname'
specifies an alternative name for the file in the archive.
Directories are added recursively by default. This can be avoided by
setting `recursive' to False. `exclude' is a function that should
- return True for each filename to be excluded.
+ return True for each filename to be excluded. `filter' is a function
+ that expects a TarInfo object argument and returns the changed
+ TarInfo object, if it returns None the TarInfo object will be
+ excluded from the archive.
"""
self._check("aw")
@@ -1924,25 +2041,19 @@ class TarFile(object):
arcname = name
# Exclude pathnames.
- if exclude is not None and exclude(name):
- self._dbg(2, "tarfile: Excluded %r" % name)
- return
+ if exclude is not None:
+ import warnings
+ warnings.warn("use the filter argument instead",
+ DeprecationWarning, 2)
+ if exclude(name):
+ self._dbg(2, "tarfile: Excluded %r" % name)
+ return
# Skip if somebody tries to archive the archive...
if self.name is not None and os.path.abspath(name) == self.name:
self._dbg(2, "tarfile: Skipped %r" % name)
return
- # Special case: The user wants to add the current
- # working directory.
- if name == ".":
- if recursive:
- if arcname == ".":
- arcname = ""
- for f in os.listdir(name):
- self.add(f, os.path.join(arcname, f), recursive, exclude)
- return
-
self._dbg(1, name)
# Create a TarInfo object from the file.
@@ -1952,6 +2063,13 @@ class TarFile(object):
self._dbg(1, "tarfile: Unsupported type %r" % name)
return
+ # Change or exclude the TarInfo object.
+ if filter is not None:
+ tarinfo = filter(tarinfo)
+ if tarinfo is None:
+ self._dbg(2, "tarfile: Excluded %r" % name)
+ return
+
# Append the tar header and data to the archive.
if tarinfo.isreg():
f = bltn_open(name, "rb")
@@ -1962,7 +2080,8 @@ class TarFile(object):
self.addfile(tarinfo)
if recursive:
for f in os.listdir(name):
- self.add(os.path.join(name, f), os.path.join(arcname, f), recursive, exclude)
+ self.add(os.path.join(name, f), os.path.join(arcname, f),
+ recursive, exclude, filter=filter)
else:
self.addfile(tarinfo)
@@ -2011,7 +2130,8 @@ class TarFile(object):
directories.append(tarinfo)
tarinfo = copy.copy(tarinfo)
tarinfo.mode = 0o700
- self.extract(tarinfo, path)
+ # Do not set_attrs directories, as we will do that further down
+ self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
# Reverse sort directories.
directories.sort(key=lambda a: a.name)
@@ -2030,11 +2150,12 @@ class TarFile(object):
else:
self._dbg(1, "tarfile: %s" % e)
- def extract(self, member, path=""):
+ def extract(self, member, path="", set_attrs=True):
"""Extract a member from the archive to the current working directory,
using its full name. Its file information is extracted as accurately
as possible. `member' may be a filename or a TarInfo object. You can
- specify a different directory using `path'.
+ specify a different directory using `path'. File attributes (owner,
+ mtime, mode) are set unless `set_attrs' is False.
"""
self._check("r")
@@ -2048,7 +2169,8 @@ class TarFile(object):
tarinfo._link_target = os.path.join(path, tarinfo.linkname)
try:
- self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
+ self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
+ set_attrs=set_attrs)
except EnvironmentError as e:
if self.errorlevel > 0:
raise
@@ -2095,23 +2217,21 @@ class TarFile(object):
raise StreamError("cannot extract (sym)link as file object")
else:
# A (sym)link's file object is its target's file object.
- return self.extractfile(self._getmember(tarinfo.linkname,
- tarinfo))
+ return self.extractfile(self._find_link_target(tarinfo))
else:
# If there's no data associated with the member (directory, chrdev,
# blkdev, etc.), return None instead of a file object.
return None
- def _extract_member(self, tarinfo, targetpath):
+ def _extract_member(self, tarinfo, targetpath, set_attrs=True):
"""Extract the TarInfo object tarinfo to a physical
file called targetpath.
"""
# Fetch the TarInfo object for the given name
# and build the destination pathname, replacing
# forward slashes to platform specific separators.
- if targetpath[-1:] == "/":
- targetpath = targetpath[:-1]
- targetpath = os.path.normpath(targetpath)
+ targetpath = targetpath.rstrip("/")
+ targetpath = targetpath.replace("/", os.sep)
# Create all upper directories.
upperdirs = os.path.dirname(targetpath)
@@ -2140,10 +2260,11 @@ class TarFile(object):
else:
self.makefile(tarinfo, targetpath)
- self.chown(tarinfo, targetpath)
- if not tarinfo.issym():
- self.chmod(tarinfo, targetpath)
- self.utime(tarinfo, targetpath)
+ if set_attrs:
+ self.chown(tarinfo, targetpath)
+ if not tarinfo.issym():
+ self.chmod(tarinfo, targetpath)
+ self.utime(tarinfo, targetpath)
#--------------------------------------------------------------------------
# Below are the different file methods. They are called via
@@ -2164,10 +2285,17 @@ class TarFile(object):
def makefile(self, tarinfo, targetpath):
"""Make a file called targetpath.
"""
- source = self.extractfile(tarinfo)
+ source = self.fileobj
+ source.seek(tarinfo.offset_data)
target = bltn_open(targetpath, "wb")
- copyfileobj(source, target)
- source.close()
+ if tarinfo.sparse is not None:
+ for offset, size in tarinfo.sparse:
+ target.seek(offset)
+ copyfileobj(source, target, size)
+ else:
+ copyfileobj(source, target, tarinfo.size)
+ target.seek(tarinfo.size)
+ target.truncate()
target.close()
def makeunknown(self, tarinfo, targetpath):
@@ -2206,27 +2334,29 @@ class TarFile(object):
(platform limitation), we try to make a copy of the referenced file
instead of a link.
"""
- linkpath = tarinfo.linkname
try:
+ # For systems that support symbolic and hard links.
if tarinfo.issym():
- os.symlink(linkpath, targetpath)
+ os.symlink(tarinfo.linkname, targetpath)
else:
# See extract().
- os.link(tarinfo._link_target, targetpath)
- except AttributeError:
+ if os.path.exists(tarinfo._link_target):
+ os.link(tarinfo._link_target, targetpath)
+ else:
+ self._extract_member(self._find_link_target(tarinfo),
+ targetpath)
+ except symlink_exception:
if tarinfo.issym():
linkpath = os.path.join(os.path.dirname(tarinfo.name),
- linkpath)
- linkpath = normpath(linkpath)
-
+ tarinfo.linkname)
+ else:
+ linkpath = tarinfo.linkname
+ else:
try:
- self._extract_member(self.getmember(linkpath), targetpath)
- except (EnvironmentError, KeyError) as e:
- linkpath = os.path.normpath(linkpath)
- try:
- shutil.copy2(linkpath, targetpath)
- except EnvironmentError as e:
- raise IOError("link could not be created")
+ self._extract_member(self._find_link_target(tarinfo),
+ targetpath)
+ except KeyError:
+ raise ExtractError("unable to resolve link inside archive")
def chown(self, tarinfo, targetpath):
"""Set owner of targetpath according to tarinfo.
@@ -2289,44 +2419,64 @@ class TarFile(object):
# Read the next block.
self.fileobj.seek(self.offset)
+ tarinfo = None
while True:
try:
tarinfo = self.tarinfo.fromtarfile(self)
- if tarinfo is None:
- return
- self.members.append(tarinfo)
-
- except HeaderError as e:
+ except EOFHeaderError as e:
if self.ignore_zeros:
self._dbg(2, "0x%X: %s" % (self.offset, e))
self.offset += BLOCKSIZE
continue
- else:
- if self.offset == 0:
- raise ReadError(str(e))
- return None
+ except InvalidHeaderError as e:
+ if self.ignore_zeros:
+ self._dbg(2, "0x%X: %s" % (self.offset, e))
+ self.offset += BLOCKSIZE
+ continue
+ elif self.offset == 0:
+ raise ReadError(str(e))
+ except EmptyHeaderError:
+ if self.offset == 0:
+ raise ReadError("empty file")
+ except TruncatedHeaderError as e:
+ if self.offset == 0:
+ raise ReadError(str(e))
+ except SubsequentHeaderError as e:
+ raise ReadError(str(e))
break
+ if tarinfo is not None:
+ self.members.append(tarinfo)
+ else:
+ self._loaded = True
+
return tarinfo
#--------------------------------------------------------------------------
# Little helper methods:
- def _getmember(self, name, tarinfo=None):
+ def _getmember(self, name, tarinfo=None, normalize=False):
"""Find an archive member by name from bottom to top.
If tarinfo is given, it is used as the starting point.
"""
# Ensure that all members have been loaded.
members = self.getmembers()
- if tarinfo is None:
- end = len(members)
- else:
- end = members.index(tarinfo)
+ # Limit the member search list up to tarinfo.
+ if tarinfo is not None:
+ members = members[:members.index(tarinfo)]
- for i in range(end - 1, -1, -1):
- if name == members[i].name:
- return members[i]
+ if normalize:
+ name = os.path.normpath(name)
+
+ for member in reversed(members):
+ if normalize:
+ member_name = os.path.normpath(member.name)
+ else:
+ member_name = member.name
+
+ if name == member_name:
+ return member
def _load(self):
"""Read through the entire archive file and look for readable
@@ -2347,6 +2497,25 @@ class TarFile(object):
if mode is not None and self.mode not in mode:
raise IOError("bad operation for mode %r" % self.mode)
+ def _find_link_target(self, tarinfo):
+ """Find the target member of a symlink or hardlink member in the
+ archive.
+ """
+ if tarinfo.issym():
+ # Always search the entire archive.
+ linkname = os.path.dirname(tarinfo.name) + "/" + tarinfo.linkname
+ limit = None
+ else:
+ # Search the archive before the link, because a hard link is
+ # just a reference to an already archived file.
+ linkname = tarinfo.linkname
+ limit = tarinfo
+
+ member = self._getmember(linkname, tarinfo=limit, normalize=True)
+ if member is None:
+ raise KeyError("linkname %r not found" % linkname)
+ return member
+
def __iter__(self):
"""Provide an iterator object.
"""
@@ -2360,6 +2529,20 @@ class TarFile(object):
"""
if level <= self.debug:
print(msg, file=sys.stderr)
+
+ def __enter__(self):
+ self._check()
+ return self
+
+ def __exit__(self, type, value, traceback):
+ if type is None:
+ self.close()
+ else:
+ # An exception occurred. We must not call close() because
+ # it would try to write end-of-archive blocks and padding.
+ if not self._extfileobj:
+ self.fileobj.close()
+ self.closed = True
# class TarFile
class TarIter:
@@ -2398,49 +2581,6 @@ class TarIter:
self.index += 1
return tarinfo
-# Helper classes for sparse file support
-class _section:
- """Base class for _data and _hole.
- """
- def __init__(self, offset, size):
- self.offset = offset
- self.size = size
- def __contains__(self, offset):
- return self.offset <= offset < self.offset + self.size
-
-class _data(_section):
- """Represent a data section in a sparse file.
- """
- def __init__(self, offset, size, realpos):
- _section.__init__(self, offset, size)
- self.realpos = realpos
-
-class _hole(_section):
- """Represent a hole section in a sparse file.
- """
- pass
-
-class _ringbuffer(list):
- """Ringbuffer class which increases performance
- over a regular list.
- """
- def __init__(self):
- self.idx = 0
- def find(self, offset):
- idx = self.idx
- while True:
- item = self[idx]
- if offset in item:
- break
- idx += 1
- if idx == len(self):
- idx = 0
- if idx == self.idx:
- # End of File
- return None
- self.idx = idx
- return item
-
#--------------------
# exported functions
#--------------------