author | Lars Gustäbel <lars@gustaebel.de> | 2007-03-13 10:47:19 +0000
committer | Lars Gustäbel <lars@gustaebel.de> | 2007-03-13 10:47:19 +0000
commit | c64e40215d556df635768f56f76d35c2bba7b300 (patch)
tree | f1c17a093df831096d2521037e79a1ff4ed87456 /Lib/tarfile.py
parent | bdd0f39de559ddedc9d952020da71a8a6973c34c (diff)
download | cpython-git-c64e40215d556df635768f56f76d35c2bba7b300.tar.gz
This is the implementation of POSIX.1-2001 (pax) format read/write
support.
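In practice the new format is selected with the format keyword when writing; reading needs no flag because the format is detected per member. A minimal sketch of that usage, in the Python 2 style of the patch (the names "example.txt" and "pax.tar" are placeholders):

```python
import tarfile

# Write an archive in the new POSIX.1-2001 (pax) format.
tar = tarfile.open("pax.tar", "w", format=tarfile.PAX_FORMAT)
tar.add("example.txt")
tar.close()

# Reading needs no special argument: each member's format is detected
# and pax extended headers are applied to the TarInfo objects.
tar = tarfile.open("pax.tar")
for tarinfo in tar:
    print tarinfo.name, tarinfo.size
tar.close()
```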
The TarInfo class now contains all the logic needed to process and
create tar header data; this logic has been moved there from the
TarFile class. The fromtarfile() method was added. The new path and
linkpath properties are aliases for the name and linkname attributes,
following the pax naming scheme.
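A short illustration of the new aliases (the member and link names are placeholders):

```python
import tarfile

t = tarfile.TarInfo("lib/spam.py")   # placeholder member name
print t.path             # same value as t.name
t.path = "lib/eggs.py"   # assigning the alias updates name as well
print t.name             # -> lib/eggs.py

t.linkname = "spam.py"
print t.linkpath         # alias for linkname -> spam.py
```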
The TarFile constructor and classmethods now accept a number of
keyword arguments that previously could only be set as attributes
(e.g. dereference, ignore_zeros). The encoding and pax_headers
arguments were added for pax support. A new tarinfo keyword argument
lets TarFile use a subclassed TarInfo for its members.
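A sketch of how these keyword arguments fit together; VerboseInfo, "archive.tar", "example.txt" and the "comment" pax keyword are made-up placeholders, not part of the patch:

```python
import tarfile

class VerboseInfo(tarfile.TarInfo):
    # Hypothetical subclass: it overrides nothing and only demonstrates
    # that TarFile will use this class for the members it creates.
    pass

tar = tarfile.open("archive.tar", "w",
                   format=tarfile.PAX_FORMAT,
                   tarinfo=VerboseInfo,
                   dereference=True,    # previously only settable as an attribute
                   encoding="utf-8",
                   pax_headers={u"comment": u"written by the pax-enabled tarfile"})
tar.add("example.txt")
tar.close()
```

When the archive is opened for writing, the pax_headers mapping is emitted as a pax global header at the start of the file.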
The boolean TarFile.posix attribute is deprecated, because three tar
formats are now supported. Instead, the desired format for writing is
selected via the format keyword argument using the constants
USTAR_FORMAT, GNU_FORMAT and PAX_FORMAT. This change affects
TarInfo.tobuf() as well.
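For example, TarInfo.tobuf() now takes one of these constants instead of the old posix flag (member name and size below are placeholders):

```python
import tarfile

t = tarfile.TarInfo("data/report.txt")
t.size = 1024

# Each call returns a string of one or more 512-byte header blocks
# in the requested format.
ustar_buf = t.tobuf(tarfile.USTAR_FORMAT)
gnu_buf = t.tobuf(tarfile.GNU_FORMAT)
pax_buf = t.tobuf(tarfile.PAX_FORMAT)
print len(ustar_buf), len(gnu_buf), len(pax_buf)
```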
The test suite has been heavily reorganized and partially rewritten.
A new testtar.tar was added that contains sample data in many formats,
produced by four different tar programs.
Some bugs and quirks that have also been fixed:
Directory names no longer carry a trailing slash in TarInfo.name or
TarFile.getnames().
Adding the same file twice no longer creates a hardlink member.
The TarFile constructor no longer requires a name argument.
The TarFile._mode attribute was renamed to mode and now contains
'r', 'w' or 'a'.
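A quick check of two of these fixes (directory and archive names are placeholders):

```python
import os
import tarfile

os.mkdir("subdir")                    # placeholder directory
tar = tarfile.open("dirs.tar", "w")   # placeholder archive name
tar.add("subdir")
print tar.mode                        # -> 'w' (formerly the private _mode attribute)
tar.close()

tar = tarfile.open("dirs.tar")
print tar.getnames()                  # -> ['subdir'], no trailing slash
tar.close()
```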
Diffstat (limited to 'Lib/tarfile.py')
-rw-r--r-- | Lib/tarfile.py | 980
1 file changed, 646 insertions, 334 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 54bb1b85b8..b6dc3ee4b3 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -33,7 +33,7 @@ __version__ = "$Revision$" # $Source$ -version = "0.8.0" +version = "0.9.0" __author__ = "Lars Gustäbel (lars@gustaebel.de)" __date__ = "$Date$" __cvsid__ = "$Id$" @@ -50,6 +50,7 @@ import errno import time import struct import copy +import re if sys.platform == 'mac': # This module needs work for MacOS9, especially in the area of pathname @@ -69,42 +70,60 @@ __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"] #--------------------------------------------------------- # tar constants #--------------------------------------------------------- -NUL = "\0" # the null character -BLOCKSIZE = 512 # length of processing blocks +NUL = "\0" # the null character +BLOCKSIZE = 512 # length of processing blocks RECORDSIZE = BLOCKSIZE * 20 # length of records -MAGIC = "ustar" # magic tar string -VERSION = "00" # version number +GNU_MAGIC = "ustar \0" # magic gnu tar string +POSIX_MAGIC = "ustar\x0000" # magic posix tar string -LENGTH_NAME = 100 # maximum length of a filename -LENGTH_LINK = 100 # maximum length of a linkname -LENGTH_PREFIX = 155 # maximum length of the prefix field -MAXSIZE_MEMBER = 077777777777L # maximum size of a file (11 octal digits) +LENGTH_NAME = 100 # maximum length of a filename +LENGTH_LINK = 100 # maximum length of a linkname +LENGTH_PREFIX = 155 # maximum length of the prefix field -REGTYPE = "0" # regular file +REGTYPE = "0" # regular file AREGTYPE = "\0" # regular file -LNKTYPE = "1" # link (inside tarfile) -SYMTYPE = "2" # symbolic link -CHRTYPE = "3" # character special device -BLKTYPE = "4" # block special device -DIRTYPE = "5" # directory +LNKTYPE = "1" # link (inside tarfile) +SYMTYPE = "2" # symbolic link +CHRTYPE = "3" # character special device +BLKTYPE = "4" # block special device +DIRTYPE = "5" # directory FIFOTYPE = "6" # fifo special device CONTTYPE = "7" # contiguous file -GNUTYPE_LONGNAME = "L" # GNU tar extension for longnames -GNUTYPE_LONGLINK = "K" # GNU tar extension for longlink -GNUTYPE_SPARSE = "S" # GNU tar extension for sparse file +GNUTYPE_LONGNAME = "L" # GNU tar longname +GNUTYPE_LONGLINK = "K" # GNU tar longlink +GNUTYPE_SPARSE = "S" # GNU tar sparse file + +XHDTYPE = "x" # POSIX.1-2001 extended header +XGLTYPE = "g" # POSIX.1-2001 global header +SOLARIS_XHDTYPE = "X" # Solaris extended header + +USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format +GNU_FORMAT = 1 # GNU tar format +PAX_FORMAT = 2 # POSIX.1-2001 (pax) format +DEFAULT_FORMAT = GNU_FORMAT #--------------------------------------------------------- # tarfile constants #--------------------------------------------------------- -SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, # file types that tarfile - SYMTYPE, DIRTYPE, FIFOTYPE, # can cope with. +# File types that tarfile supports: +SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, + SYMTYPE, DIRTYPE, FIFOTYPE, CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE) -REGULAR_TYPES = (REGTYPE, AREGTYPE, # file types that somehow - CONTTYPE, GNUTYPE_SPARSE) # represent regular files +# File types that will be treated as a regular file. +REGULAR_TYPES = (REGTYPE, AREGTYPE, + CONTTYPE, GNUTYPE_SPARSE) + +# File types that are part of the GNU tar format. +GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, + GNUTYPE_SPARSE) + +# Fields from a pax header that override a TarInfo attribute. 
+PAX_FIELDS = ("path", "linkpath", "size", "mtime", + "uid", "gid", "uname", "gname") #--------------------------------------------------------- # Bits used in the mode field, values in octal. @@ -131,6 +150,13 @@ TOWRITE = 0002 # write by other TOEXEC = 0001 # execute/search by other #--------------------------------------------------------- +# initialization +#--------------------------------------------------------- +ENCODING = sys.getfilesystemencoding() +if ENCODING is None: + ENCODING = "ascii" + +#--------------------------------------------------------- # Some useful functions #--------------------------------------------------------- @@ -139,6 +165,15 @@ def stn(s, length): """ return s[:length] + (length - len(s)) * NUL +def nts(s): + """Convert a null-terminated string field to a python string. + """ + # Use the string up to the first null char. + p = s.find("\0") + if p == -1: + return s + return s[:p] + def nti(s): """Convert a number field to a python number. """ @@ -146,7 +181,7 @@ def nti(s): # itn() below. if s[0] != chr(0200): try: - n = int(s.rstrip(NUL + " ") or "0", 8) + n = int(nts(s) or "0", 8) except ValueError: raise HeaderError("invalid header") else: @@ -156,7 +191,7 @@ def nti(s): n += ord(s[i + 1]) return n -def itn(n, digits=8, posix=False): +def itn(n, digits=8, format=DEFAULT_FORMAT): """Convert a python number to a number field. """ # POSIX 1003.1-1988 requires numbers to be encoded as a string of @@ -168,7 +203,7 @@ def itn(n, digits=8, posix=False): if 0 <= n < 8 ** (digits - 1): s = "%0*o" % (digits - 1, n) + NUL else: - if posix: + if format != GNU_FORMAT or n >= 256 ** (digits - 1): raise ValueError("overflow in number field") if n < 0: @@ -514,7 +549,10 @@ class _Stream: buf = self.__read(self.bufsize) if not buf: break - buf = self.cmp.decompress(buf) + try: + buf = self.cmp.decompress(buf) + except IOError: + raise ReadError("invalid compressed data") t.append(buf) c += len(buf) t = "".join(t) @@ -575,6 +613,7 @@ class _BZ2Proxy(object): def __init__(self, fileobj, mode): self.fileobj = fileobj self.mode = mode + self.name = getattr(self.fileobj, "name", None) self.init() def init(self): @@ -847,8 +886,8 @@ class TarInfo(object): """Construct a TarInfo object. name is the optional name of the member. """ - self.name = name # member name (dirnames must end with '/') - self.mode = 0666 # file permissions + self.name = name # member name + self.mode = 0644 # file permissions self.uid = 0 # user id self.gid = 0 # group id self.size = 0 # file size @@ -856,17 +895,274 @@ class TarInfo(object): self.chksum = 0 # header checksum self.type = REGTYPE # member type self.linkname = "" # link name - self.uname = "user" # user name - self.gname = "group" # group name + self.uname = "root" # user name + self.gname = "root" # group name self.devmajor = 0 # device major number self.devminor = 0 # device minor number self.offset = 0 # the tar header starts here self.offset_data = 0 # the file's data starts here + self.pax_headers = {} # pax header information + + # In pax headers the "name" and "linkname" field are called + # "path" and "linkpath". 
+ def _getpath(self): + return self.name + def _setpath(self, name): + self.name = name + path = property(_getpath, _setpath) + + def _getlinkpath(self): + return self.linkname + def _setlinkpath(self, linkname): + self.linkname = linkname + linkpath = property(_getlinkpath, _setlinkpath) + def __repr__(self): return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) + def get_info(self): + """Return the TarInfo's attributes as a dictionary. + """ + info = { + "name": normpath(self.name), + "mode": self.mode & 07777, + "uid": self.uid, + "gid": self.gid, + "size": self.size, + "mtime": self.mtime, + "chksum": self.chksum, + "type": self.type, + "linkname": normpath(self.linkname) if self.linkname else "", + "uname": self.uname, + "gname": self.gname, + "devmajor": self.devmajor, + "devminor": self.devminor + } + + if info["type"] == DIRTYPE and not info["name"].endswith("/"): + info["name"] += "/" + + return info + + def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING): + """Return a tar header as a string of 512 byte blocks. + """ + if format == USTAR_FORMAT: + return self.create_ustar_header() + elif format == GNU_FORMAT: + return self.create_gnu_header() + elif format == PAX_FORMAT: + return self.create_pax_header(encoding) + else: + raise ValueError("invalid format") + + def create_ustar_header(self): + """Return the object as a ustar header block. + """ + info = self.get_info() + info["magic"] = POSIX_MAGIC + + if len(info["linkname"]) > LENGTH_LINK: + raise ValueError("linkname is too long") + + if len(info["name"]) > LENGTH_NAME: + info["prefix"], info["name"] = self._posix_split_name(info["name"]) + + return self._create_header(info, USTAR_FORMAT) + + def create_gnu_header(self): + """Return the object as a GNU header block sequence. + """ + info = self.get_info() + info["magic"] = GNU_MAGIC + + buf = "" + if len(info["linkname"]) > LENGTH_LINK: + buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK) + + if len(info["name"]) > LENGTH_NAME: + buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME) + + return buf + self._create_header(info, GNU_FORMAT) + + def create_pax_header(self, encoding): + """Return the object as a ustar header block. If it cannot be + represented this way, prepend a pax extended header sequence + with supplement information. + """ + info = self.get_info() + info["magic"] = POSIX_MAGIC + pax_headers = self.pax_headers.copy() + + # Test string fields for values that exceed the field length or cannot + # be represented in ASCII encoding. + for name, hname, length in ( + ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), + ("uname", "uname", 32), ("gname", "gname", 32)): + + val = info[name].decode(encoding) + + # Try to encode the string as ASCII. + try: + val.encode("ascii") + except UnicodeEncodeError: + pax_headers[hname] = val + continue + + if len(val) > length: + if name == "name": + # Try to squeeze a longname in the prefix and name fields as in + # ustar format. + try: + info["prefix"], info["name"] = self._posix_split_name(info["name"]) + except ValueError: + pax_headers[hname] = val + else: + continue + else: + pax_headers[hname] = val + + # Test number fields for values that exceed the field limit or values + # that like to be stored as float. 
+ for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): + val = info[name] + if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): + pax_headers[name] = unicode(val) + info[name] = 0 + + if pax_headers: + buf = self._create_pax_generic_header(pax_headers) + else: + buf = "" + + return buf + self._create_header(info, USTAR_FORMAT) + + @classmethod + def create_pax_global_header(cls, pax_headers, encoding): + """Return the object as a pax global header block sequence. + """ + new_headers = {} + for key, val in pax_headers.iteritems(): + key = cls._to_unicode(key, encoding) + val = cls._to_unicode(val, encoding) + new_headers[key] = val + return cls._create_pax_generic_header(new_headers, type=XGLTYPE) + + @staticmethod + def _to_unicode(value, encoding): + if isinstance(value, unicode): + return value + elif isinstance(value, (int, long, float)): + return unicode(value) + elif isinstance(value, str): + return unicode(value, encoding) + else: + raise ValueError("unable to convert to unicode: %r" % value) + + def _posix_split_name(self, name): + """Split a name longer than 100 chars into a prefix + and a name part. + """ + prefix = name[:LENGTH_PREFIX + 1] + while prefix and prefix[-1] != "/": + prefix = prefix[:-1] + + name = name[len(prefix):] + prefix = prefix[:-1] + + if not prefix or len(name) > LENGTH_NAME: + raise ValueError("name is too long") + return prefix, name + + @staticmethod + def _create_header(info, format): + """Return a header block. info is a dictionary with file + information, format must be one of the *_FORMAT constants. + """ + parts = [ + stn(info.get("name", ""), 100), + itn(info.get("mode", 0) & 07777, 8, format), + itn(info.get("uid", 0), 8, format), + itn(info.get("gid", 0), 8, format), + itn(info.get("size", 0), 12, format), + itn(info.get("mtime", 0), 12, format), + " ", # checksum field + info.get("type", REGTYPE), + stn(info.get("linkname", ""), 100), + stn(info.get("magic", ""), 8), + stn(info.get("uname", ""), 32), + stn(info.get("gname", ""), 32), + itn(info.get("devmajor", 0), 8, format), + itn(info.get("devminor", 0), 8, format), + stn(info.get("prefix", ""), 155) + ] + + buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts)) + chksum = calc_chksums(buf[-BLOCKSIZE:])[0] + buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] + return buf + + @staticmethod + def _create_payload(payload): + """Return the string payload filled with zero bytes + up to the next 512 byte border. + """ + blocks, remainder = divmod(len(payload), BLOCKSIZE) + if remainder > 0: + payload += (BLOCKSIZE - remainder) * NUL + return payload + + @classmethod + def _create_gnu_long_header(cls, name, type): + """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence + for name. + """ + name += NUL + + info = {} + info["name"] = "././@LongLink" + info["type"] = type + info["size"] = len(name) + info["magic"] = GNU_MAGIC + + # create extended header + name blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(name) + + @classmethod + def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE): + """Return a POSIX.1-2001 extended or global header sequence + that contains a list of keyword, value pairs. The values + must be unicode objects. 
+ """ + records = [] + for keyword, value in pax_headers.iteritems(): + keyword = keyword.encode("utf8") + value = value.encode("utf8") + l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' + n = p = 0 + while True: + n = l + len(str(p)) + if n == p: + break + p = n + records.append("%d %s=%s\n" % (p, keyword, value)) + records = "".join(records) + + # We use a hardcoded "././@PaxHeader" name like star does + # instead of the one that POSIX recommends. + info = {} + info["name"] = "././@PaxHeader" + info["type"] = type + info["size"] = len(records) + info["magic"] = POSIX_MAGIC + + # Create pax header + record blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(records) + @classmethod def frombuf(cls, buf): """Construct a TarInfo object from a 512 byte string buffer. @@ -880,125 +1176,251 @@ class TarInfo(object): if chksum not in calc_chksums(buf): raise HeaderError("bad checksum") - tarinfo = cls() - tarinfo.buf = buf - tarinfo.name = buf[0:100].rstrip(NUL) - tarinfo.mode = nti(buf[100:108]) - tarinfo.uid = nti(buf[108:116]) - tarinfo.gid = nti(buf[116:124]) - tarinfo.size = nti(buf[124:136]) - tarinfo.mtime = nti(buf[136:148]) - tarinfo.chksum = chksum - tarinfo.type = buf[156:157] - tarinfo.linkname = buf[157:257].rstrip(NUL) - tarinfo.uname = buf[265:297].rstrip(NUL) - tarinfo.gname = buf[297:329].rstrip(NUL) - tarinfo.devmajor = nti(buf[329:337]) - tarinfo.devminor = nti(buf[337:345]) - prefix = buf[345:500].rstrip(NUL) - - if prefix and not tarinfo.issparse(): - tarinfo.name = prefix + "/" + tarinfo.name + obj = cls() + obj.buf = buf + obj.name = nts(buf[0:100]) + obj.mode = nti(buf[100:108]) + obj.uid = nti(buf[108:116]) + obj.gid = nti(buf[116:124]) + obj.size = nti(buf[124:136]) + obj.mtime = nti(buf[136:148]) + obj.chksum = chksum + obj.type = buf[156:157] + obj.linkname = nts(buf[157:257]) + obj.uname = nts(buf[265:297]) + obj.gname = nts(buf[297:329]) + obj.devmajor = nti(buf[329:337]) + obj.devminor = nti(buf[337:345]) + prefix = nts(buf[345:500]) + + # Old V7 tar format represents a directory as a regular + # file with a trailing slash. + if obj.type == AREGTYPE and obj.name.endswith("/"): + obj.type = DIRTYPE - return tarinfo + # Remove redundant slashes from directories. + if obj.isdir(): + obj.name = obj.name.rstrip("/") - def tobuf(self, posix=False): - """Return a tar header as a string of 512 byte blocks. - """ - buf = "" - type = self.type - prefix = "" + # Reconstruct a ustar longname. + if prefix and obj.type not in GNU_TYPES: + obj.name = prefix + "/" + obj.name + return obj - if self.name.endswith("/"): - type = DIRTYPE + @classmethod + def fromtarfile(cls, tarfile): + """Return the next TarInfo object from TarFile object + tarfile. + """ + buf = tarfile.fileobj.read(BLOCKSIZE) + if not buf: + return + obj = cls.frombuf(buf) + obj.offset = tarfile.fileobj.tell() - BLOCKSIZE + return obj._proc_member(tarfile) - if type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - # Prevent "././@LongLink" from being normalized. - name = self.name + #-------------------------------------------------------------------------- + # The following are methods that are called depending on the type of a + # member. The entry point is _proc_member() which can be overridden in a + # subclass to add custom _proc_*() methods. A _proc_*() method MUST + # implement the following + # operations: + # 1. Set self.offset_data to the position where the data blocks begin, + # if there is data that follows. + # 2. 
Set tarfile.offset to the position where the next member's header will + # begin. + # 3. Return self or another valid TarInfo object. + def _proc_member(self, tarfile): + """Choose the right processing method depending on + the type and call it. + """ + if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): + return self._proc_gnulong(tarfile) + elif self.type == GNUTYPE_SPARSE: + return self._proc_sparse(tarfile) + elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): + return self._proc_pax(tarfile) else: - name = normpath(self.name) + return self._proc_builtin(tarfile) - if type == DIRTYPE: - # directories should end with '/' - name += "/" + def _proc_builtin(self, tarfile): + """Process a builtin type or an unknown type which + will be treated as a regular file. + """ + self.offset_data = tarfile.fileobj.tell() + offset = self.offset_data + if self.isreg() or self.type not in SUPPORTED_TYPES: + # Skip the following data blocks. + offset += self._block(self.size) + tarfile.offset = offset - linkname = self.linkname - if linkname: - # if linkname is empty we end up with a '.' - linkname = normpath(linkname) + # Patch the TarInfo object with saved extended + # header information. + for keyword, value in tarfile.pax_headers.iteritems(): + if keyword in PAX_FIELDS: + setattr(self, keyword, value) + self.pax_headers[keyword] = value - if posix: - if self.size > MAXSIZE_MEMBER: - raise ValueError("file is too large (>= 8 GB)") + return self - if len(self.linkname) > LENGTH_LINK: - raise ValueError("linkname is too long (>%d)" % (LENGTH_LINK)) + def _proc_gnulong(self, tarfile): + """Process the blocks that hold a GNU longname + or longlink member. + """ + buf = tarfile.fileobj.read(self._block(self.size)) - if len(name) > LENGTH_NAME: - prefix = name[:LENGTH_PREFIX + 1] - while prefix and prefix[-1] != "/": - prefix = prefix[:-1] + # Fetch the next header and process it. + b = tarfile.fileobj.read(BLOCKSIZE) + t = self.frombuf(b) + t.offset = self.offset + next = t._proc_member(tarfile) - name = name[len(prefix):] - prefix = prefix[:-1] + # Patch the TarInfo object from the next header with + # the longname information. + next.offset = self.offset + if self.type == GNUTYPE_LONGNAME: + next.name = buf.rstrip(NUL) + elif self.type == GNUTYPE_LONGLINK: + next.linkname = buf.rstrip(NUL) - if not prefix or len(name) > LENGTH_NAME: - raise ValueError("name is too long") + return next - else: - if len(self.linkname) > LENGTH_LINK: - buf += self._create_gnulong(self.linkname, GNUTYPE_LONGLINK) + def _proc_sparse(self, tarfile): + """Process a GNU sparse header plus extra headers. + """ + buf = self.buf + sp = _ringbuffer() + pos = 386 + lastpos = 0L + realpos = 0L + # There are 4 possible sparse structs in the + # first header. 
+ for i in xrange(4): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 - if len(name) > LENGTH_NAME: - buf += self._create_gnulong(name, GNUTYPE_LONGNAME) + isextended = ord(buf[482]) + origsize = nti(buf[483:495]) - parts = [ - stn(name, 100), - itn(self.mode & 07777, 8, posix), - itn(self.uid, 8, posix), - itn(self.gid, 8, posix), - itn(self.size, 12, posix), - itn(self.mtime, 12, posix), - " ", # checksum field - type, - stn(self.linkname, 100), - stn(MAGIC, 6), - stn(VERSION, 2), - stn(self.uname, 32), - stn(self.gname, 32), - itn(self.devmajor, 8, posix), - itn(self.devminor, 8, posix), - stn(prefix, 155) - ] + # If the isextended flag is given, + # there are extra headers to process. + while isextended == 1: + buf = tarfile.fileobj.read(BLOCKSIZE) + pos = 0 + for i in xrange(21): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 + isextended = ord(buf[504]) - buf += struct.pack("%ds" % BLOCKSIZE, "".join(parts)) - chksum = calc_chksums(buf[-BLOCKSIZE:])[0] - buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] - self.buf = buf - return buf + if lastpos < origsize: + sp.append(_hole(lastpos, origsize - lastpos)) + + self.sparse = sp - def _create_gnulong(self, name, type): - """Create a GNU longname/longlink header from name. - It consists of an extended tar header, with the length - of the longname as size, followed by data blocks, - which contain the longname as a null terminated string. + self.offset_data = tarfile.fileobj.tell() + tarfile.offset = self.offset_data + self._block(self.size) + self.size = origsize + + return self + + def _proc_pax(self, tarfile): + """Process an extended or global header as described in + POSIX.1-2001. """ - name += NUL + # Read the header information. + buf = tarfile.fileobj.read(self._block(self.size)) - tarinfo = self.__class__() - tarinfo.name = "././@LongLink" - tarinfo.type = type - tarinfo.mode = 0 - tarinfo.size = len(name) - - # create extended header - buf = tarinfo.tobuf() - # create name blocks - buf += name - blocks, remainder = divmod(len(name), BLOCKSIZE) - if remainder > 0: - buf += (BLOCKSIZE - remainder) * NUL - return buf + # A pax header stores supplemental information for either + # the following file (extended) or all following files + # (global). + if self.type == XGLTYPE: + pax_headers = tarfile.pax_headers + else: + pax_headers = tarfile.pax_headers.copy() + + # Fields in POSIX.1-2001 that are numbers, all other fields + # are treated as UTF-8 strings. + type_mapping = { + "atime": float, + "ctime": float, + "mtime": float, + "uid": int, + "gid": int, + "size": int + } + + # Parse pax header information. A record looks like that: + # "%d %s=%s\n" % (length, keyword, value). length is the size + # of the complete record including the length field itself and + # the newline. 
+ regex = re.compile(r"(\d+) ([^=]+)=", re.U) + pos = 0 + while True: + match = regex.match(buf, pos) + if not match: + break + + length, keyword = match.groups() + length = int(length) + value = buf[match.end(2) + 1:match.start(1) + length - 1] + + keyword = keyword.decode("utf8") + keyword = keyword.encode(tarfile.encoding) + + value = value.decode("utf8") + if keyword in type_mapping: + try: + value = type_mapping[keyword](value) + except ValueError: + value = 0 + else: + value = value.encode(tarfile.encoding) + + pax_headers[keyword] = value + pos += length + + # Fetch the next header that will be patched with the + # supplement information from the pax header (extended + # only). + t = self.fromtarfile(tarfile) + + if self.type != XGLTYPE and t is not None: + # Patch the TarInfo object from the next header with + # the pax header's information. + for keyword, value in pax_headers.items(): + if keyword in PAX_FIELDS: + setattr(t, keyword, value) + pax_headers[keyword] = value + t.pax_headers = pax_headers.copy() + + return t + + def _block(self, count): + """Round up a byte count by BLOCKSIZE and return it, + e.g. _block(834) => 1024. + """ + blocks, remainder = divmod(count, BLOCKSIZE) + if remainder: + blocks += 1 + return blocks * BLOCKSIZE def isreg(self): return self.type in REGULAR_TYPES @@ -1038,12 +1460,18 @@ class TarFile(object): # messages (if debug >= 0). If > 0, errors # are passed to the caller as exceptions. - posix = False # If True, generates POSIX.1-1990-compliant - # archives (no GNU extensions!) + format = DEFAULT_FORMAT # The format to use when creating an archive. + + encoding = ENCODING # Transfer UTF-8 strings from POSIX.1-2001 + # headers to this encoding. + + tarinfo = TarInfo # The default TarInfo class to use. - fileobject = ExFileObject + fileobject = ExFileObject # The default ExFileObject class to use. - def __init__(self, name=None, mode="r", fileobj=None): + def __init__(self, name=None, mode="r", fileobj=None, format=None, + tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, + pax_headers=None, debug=None, errorlevel=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1052,58 +1480,86 @@ class TarFile(object): can be determined, `mode' is overridden by `fileobj's mode. `fileobj' is not closed, when TarFile is closed. """ - self.name = os.path.abspath(name) - if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - self._mode = mode - self.mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] + self.mode = mode + self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] if not fileobj: - if self._mode == "a" and not os.path.exists(self.name): + if self.mode == "a" and not os.path.exists(name): # Create nonexistent files in append mode. - self._mode = "w" - self.mode = "wb" - fileobj = file(self.name, self.mode) + self.mode = "w" + self._mode = "wb" + fileobj = file(name, self._mode) self._extfileobj = False else: - if self.name is None and hasattr(fileobj, "name"): - self.name = os.path.abspath(fileobj.name) + if name is None and hasattr(fileobj, "name"): + name = fileobj.name if hasattr(fileobj, "mode"): - self.mode = fileobj.mode + self._mode = fileobj.mode self._extfileobj = True + self.name = os.path.abspath(name) self.fileobj = fileobj - # Init datastructures + # Init attributes. 
+ if format is not None: + self.format = format + if tarinfo is not None: + self.tarinfo = tarinfo + if dereference is not None: + self.dereference = dereference + if ignore_zeros is not None: + self.ignore_zeros = ignore_zeros + if encoding is not None: + self.encoding = encoding + if debug is not None: + self.debug = debug + if errorlevel is not None: + self.errorlevel = errorlevel + + # Init datastructures. self.closed = False self.members = [] # list of members as TarInfo objects self._loaded = False # flag if all members have been read self.offset = 0L # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added + self.pax_headers = {} # save contents of global pax headers - if self._mode == "r": + if self.mode == "r": self.firstmember = None self.firstmember = self.next() - if self._mode == "a": + if self.mode == "a": # Move to the end of the archive, # before the first empty block. self.firstmember = None while True: - try: - tarinfo = self.next() - except ReadError: - self.fileobj.seek(0) - break - if tarinfo is None: + if self.next() is None: if self.offset > 0: self.fileobj.seek(- BLOCKSIZE, 1) break - if self._mode in "aw": + if self.mode in "aw": self._loaded = True + if pax_headers: + buf = self.tarinfo.create_pax_global_header( + pax_headers.copy(), self.encoding) + self.fileobj.write(buf) + self.offset += len(buf) + + def _getposix(self): + return self.format == USTAR_FORMAT + def _setposix(self, value): + import warnings + warnings.warn("use the format attribute instead", DeprecationWarning) + if value: + self.format = USTAR_FORMAT + else: + self.format = GNU_FORMAT + posix = property(_getposix, _setposix) + #-------------------------------------------------------------------------- # Below are the classmethods which act as alternate constructors to the # TarFile class. The open() method is the only one that is needed for @@ -1116,7 +1572,7 @@ class TarFile(object): # by adding it to the mapping in OPEN_METH. @classmethod - def open(cls, name=None, mode="r", fileobj=None, bufsize=20*512): + def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): """Open a tar archive for reading, writing or appending. Return an appropriate TarFile class. @@ -1149,8 +1605,8 @@ class TarFile(object): if fileobj is not None: saved_pos = fileobj.tell() try: - return func(name, "r", fileobj) - except (ReadError, CompressionError): + return func(name, "r", fileobj, **kwargs) + except (ReadError, CompressionError), e: if fileobj is not None: fileobj.seek(saved_pos) continue @@ -1167,7 +1623,7 @@ class TarFile(object): func = getattr(cls, cls.OPEN_METH[comptype]) else: raise CompressionError("unknown compression type %r" % comptype) - return func(name, filemode, fileobj) + return func(name, filemode, fileobj, **kwargs) elif "|" in mode: filemode, comptype = mode.split("|", 1) @@ -1178,25 +1634,26 @@ class TarFile(object): raise ValueError("mode must be 'r' or 'w'") t = cls(name, filemode, - _Stream(name, filemode, comptype, fileobj, bufsize)) + _Stream(name, filemode, comptype, fileobj, bufsize), + **kwargs) t._extfileobj = False return t elif mode in "aw": - return cls.taropen(name, mode, fileobj) + return cls.taropen(name, mode, fileobj, **kwargs) raise ValueError("undiscernible mode") @classmethod - def taropen(cls, name, mode="r", fileobj=None): + def taropen(cls, name, mode="r", fileobj=None, **kwargs): """Open uncompressed tar archive name for reading or writing. 
""" if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - return cls(name, mode, fileobj) + return cls(name, mode, fileobj, **kwargs) @classmethod - def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9): + def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open gzip compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1214,14 +1671,15 @@ class TarFile(object): try: t = cls.taropen(name, mode, - gzip.GzipFile(name, mode, compresslevel, fileobj)) + gzip.GzipFile(name, mode, compresslevel, fileobj), + **kwargs) except IOError: raise ReadError("not a gzip file") t._extfileobj = False return t @classmethod - def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9): + def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open bzip2 compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1239,7 +1697,7 @@ class TarFile(object): fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel) try: - t = cls.taropen(name, mode, fileobj) + t = cls.taropen(name, mode, fileobj, **kwargs) except IOError: raise ReadError("not a bzip2 file") t._extfileobj = False @@ -1262,7 +1720,7 @@ class TarFile(object): if self.closed: return - if self._mode in "aw": + if self.mode in "aw": self.fileobj.write(NUL * (BLOCKSIZE * 2)) self.offset += (BLOCKSIZE * 2) # fill up the end with zero-blocks @@ -1328,7 +1786,8 @@ class TarFile(object): # Now, fill the TarInfo object with # information specific for the file. - tarinfo = TarInfo() + tarinfo = self.tarinfo() + tarinfo.tarfile = self # Use os.stat or os.lstat, depending on platform # and if symlinks shall be resolved. @@ -1344,8 +1803,8 @@ class TarFile(object): stmd = statres.st_mode if stat.S_ISREG(stmd): inode = (statres.st_ino, statres.st_dev) - if not self.dereference and \ - statres.st_nlink > 1 and inode in self.inodes: + if not self.dereference and statres.st_nlink > 1 and \ + inode in self.inodes and arcname != self.inodes[inode]: # Is it a hardlink to an already # archived file? type = LNKTYPE @@ -1422,7 +1881,7 @@ class TarFile(object): print "%d-%02d-%02d %02d:%02d:%02d" \ % time.localtime(tarinfo.mtime)[:6], - print tarinfo.name, + print tarinfo.name + ("/" if tarinfo.isdir() else ""), if verbose: if tarinfo.issym(): @@ -1454,7 +1913,7 @@ class TarFile(object): if recursive: if arcname == ".": arcname = "" - for f in os.listdir("."): + for f in os.listdir(name): self.add(f, os.path.join(arcname, f)) return @@ -1493,7 +1952,7 @@ class TarFile(object): tarinfo = copy.copy(tarinfo) - buf = tarinfo.tobuf(self.posix) + buf = tarinfo.tobuf(self.format, self.encoding) self.fileobj.write(buf) self.offset += len(buf) @@ -1525,7 +1984,7 @@ class TarFile(object): # Extract directory with a safe mode, so that # all files below can be extracted as well. try: - os.makedirs(os.path.join(path, tarinfo.name), 0777) + os.makedirs(os.path.join(path, tarinfo.name), 0700) except EnvironmentError: pass directories.append(tarinfo) @@ -1557,10 +2016,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member # Prepare the link target for makelink(). 
if tarinfo.islnk(): @@ -1593,10 +2052,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member if tarinfo.isreg(): return self.fileobject(self, tarinfo) @@ -1809,20 +2268,11 @@ class TarFile(object): # Read the next block. self.fileobj.seek(self.offset) while True: - buf = self.fileobj.read(BLOCKSIZE) - if not buf: - return None - try: - tarinfo = TarInfo.frombuf(buf) - - # Set the TarInfo object's offset to the current position of the - # TarFile and set self.offset to the position where the data blocks - # should begin. - tarinfo.offset = self.offset - self.offset += BLOCKSIZE - - tarinfo = self.proc_member(tarinfo) + tarinfo = self.tarinfo.fromtarfile(self) + if tarinfo is None: + return + self.members.append(tarinfo) except HeaderError, e: if self.ignore_zeros: @@ -1835,149 +2285,11 @@ class TarFile(object): return None break - # Some old tar programs represent a directory as a regular - # file with a trailing slash. - if tarinfo.isreg() and tarinfo.name.endswith("/"): - tarinfo.type = DIRTYPE - - # Directory names should have a '/' at the end. - if tarinfo.isdir(): - tarinfo.name += "/" - - self.members.append(tarinfo) - return tarinfo - - #-------------------------------------------------------------------------- - # The following are methods that are called depending on the type of a - # member. The entry point is proc_member() which is called with a TarInfo - # object created from the header block from the current offset. The - # proc_member() method can be overridden in a subclass to add custom - # proc_*() methods. A proc_*() method MUST implement the following - # operations: - # 1. Set tarinfo.offset_data to the position where the data blocks begin, - # if there is data that follows. - # 2. Set self.offset to the position where the next member's header will - # begin. - # 3. Return tarinfo or another valid TarInfo object. - def proc_member(self, tarinfo): - """Choose the right processing method for tarinfo depending - on its type and call it. - """ - if tarinfo.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - return self.proc_gnulong(tarinfo) - elif tarinfo.type == GNUTYPE_SPARSE: - return self.proc_sparse(tarinfo) - else: - return self.proc_builtin(tarinfo) - - def proc_builtin(self, tarinfo): - """Process a builtin type member or an unknown member - which will be treated as a regular file. - """ - tarinfo.offset_data = self.offset - if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: - # Skip the following data blocks. - self.offset += self._block(tarinfo.size) - return tarinfo - - def proc_gnulong(self, tarinfo): - """Process the blocks that hold a GNU longname - or longlink member. - """ - buf = "" - count = tarinfo.size - while count > 0: - block = self.fileobj.read(BLOCKSIZE) - buf += block - self.offset += BLOCKSIZE - count -= BLOCKSIZE - - # Fetch the next header and process it. - b = self.fileobj.read(BLOCKSIZE) - t = TarInfo.frombuf(b) - t.offset = self.offset - self.offset += BLOCKSIZE - next = self.proc_member(t) - - # Patch the TarInfo object from the next header with - # the longname information. - next.offset = tarinfo.offset - if tarinfo.type == GNUTYPE_LONGNAME: - next.name = buf.rstrip(NUL) - elif tarinfo.type == GNUTYPE_LONGLINK: - next.linkname = buf.rstrip(NUL) - - return next - - def proc_sparse(self, tarinfo): - """Process a GNU sparse header plus extra headers. 
- """ - buf = tarinfo.buf - sp = _ringbuffer() - pos = 386 - lastpos = 0L - realpos = 0L - # There are 4 possible sparse structs in the - # first header. - for i in xrange(4): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - - isextended = ord(buf[482]) - origsize = nti(buf[483:495]) - - # If the isextended flag is given, - # there are extra headers to process. - while isextended == 1: - buf = self.fileobj.read(BLOCKSIZE) - self.offset += BLOCKSIZE - pos = 0 - for i in xrange(21): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - isextended = ord(buf[504]) - - if lastpos < origsize: - sp.append(_hole(lastpos, origsize - lastpos)) - - tarinfo.sparse = sp - - tarinfo.offset_data = self.offset - self.offset += self._block(tarinfo.size) - tarinfo.size = origsize - return tarinfo #-------------------------------------------------------------------------- # Little helper methods: - def _block(self, count): - """Round up a byte count by BLOCKSIZE and return it, - e.g. _block(834) => 1024. - """ - blocks, remainder = divmod(count, BLOCKSIZE) - if remainder: - blocks += 1 - return blocks * BLOCKSIZE - def _getmember(self, name, tarinfo=None): """Find an archive member by name from bottom to top. If tarinfo is given, it is used as the starting point. @@ -2010,8 +2322,8 @@ class TarFile(object): """ if self.closed: raise IOError("%s is closed" % self.__class__.__name__) - if mode is not None and self._mode not in mode: - raise IOError("bad operation for mode %r" % self._mode) + if mode is not None and self.mode not in mode: + raise IOError("bad operation for mode %r" % self.mode) def __iter__(self): """Provide an iterator object. |
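One detail of the new pax writing code that is easy to miss: each extended-header record has the form "%d %s=%s\n", where the leading decimal is the total record length including the digits of the length field itself, so _create_pax_generic_header finds it by iterating to a fixed point. A standalone sketch of just that computation, using Python 2-era byte strings as in the patch:

```python
def pax_record(keyword, value):
    """Build one pax extended-header record.

    The record is "%d %s=%s\n"; the leading number counts every byte of
    the record, including its own digits, so the width is found by
    iterating until the value stops changing (as in the patch).
    """
    keyword = keyword.encode("utf8")
    value = value.encode("utf8")
    l = len(keyword) + len(value) + 3    # ' ' + '=' + '\n'
    n = p = 0
    while True:
        n = l + len(str(p))
        if n == p:
            break
        p = n
    return "%d %s=%s\n" % (p, keyword, value)

print repr(pax_record(u"path", u"spam.txt"))
# -> '17 path=spam.txt\n'  (17 bytes in total)
```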