diff options
Diffstat (limited to 'Lib/zipfile.py')
-rw-r--r-- | Lib/zipfile.py | 1074 |
1 files changed, 592 insertions, 482 deletions
diff --git a/Lib/zipfile.py b/Lib/zipfile.py index a382383e17..b223b4a4c1 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -3,8 +3,17 @@ Read and write ZIP files. XXX references to utf-8 need further investigation. """ -import struct, os, time, sys, shutil -import binascii, io, stat +import io +import os +import re +import imp +import sys +import time +import stat +import shutil +import struct +import binascii + try: import zlib # We may need its compression method @@ -13,10 +22,10 @@ except ImportError: zlib = None crc32 = binascii.crc32 -__all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", - "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] +__all__ = ["BadZipFile", "BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", + "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] -class BadZipfile(Exception): +class BadZipFile(Exception): pass @@ -26,7 +35,8 @@ class LargeZipFile(Exception): and those extensions are disabled. """ -error = BadZipfile # The exception raised by this module +error = BadZipfile = BadZipFile # Pre-3.2 compatibility names + ZIP64_LIMIT = (1 << 31) - 1 ZIP_FILECOUNT_LIMIT = 1 << 16 @@ -166,16 +176,20 @@ def _EndRecData64(fpin, offset, endrec): return endrec data = fpin.read(sizeEndCentDir64Locator) + if len(data) != sizeEndCentDir64Locator: + return endrec sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) if sig != stringEndArchive64Locator: return endrec if diskno != 0 or disks != 1: - raise BadZipfile("zipfiles that span multiple disks are not supported") + raise BadZipFile("zipfiles that span multiple disks are not supported") # Assume no 'zip64 extensible data' fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) data = fpin.read(sizeEndCentDir64) + if len(data) != sizeEndCentDir64: + return endrec sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ struct.unpack(structEndArchive64, data) @@ -211,7 +225,9 @@ def _EndRecData(fpin): except IOError: return None data = fpin.read() - if data[0:4] == stringEndArchive and data[-2:] == b"\000\000": + if (len(data) == sizeEndCentDir and + data[0:4] == stringEndArchive and + data[-2:] == b"\000\000"): # the signature is correct and there's no comment, unpack structure endrec = struct.unpack(structEndArchive, data) endrec=list(endrec) @@ -235,20 +251,21 @@ def _EndRecData(fpin): if start >= 0: # found the magic number; attempt to unpack and interpret recData = data[start:start+sizeEndCentDir] + if len(recData) != sizeEndCentDir: + # Zip file is corrupted. + return None endrec = list(struct.unpack(structEndArchive, recData)) - comment = data[start+sizeEndCentDir:] - # check that comment length is correct - if endrec[_ECD_COMMENT_SIZE] == len(comment): - # Append the archive comment and start offset - endrec.append(comment) - endrec.append(maxCommentStart + start) + commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file + comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] + endrec.append(comment) + endrec.append(maxCommentStart + start) - # Try to read the "Zip64 end of central directory" structure - return _EndRecData64(fpin, maxCommentStart + start - filesize, - endrec) + # Try to read the "Zip64 end of central directory" structure + return _EndRecData64(fpin, maxCommentStart + start - filesize, + endrec) # Unable to find a valid end of central directory structure - return + return None class ZipInfo (object): @@ -292,6 +309,10 @@ class ZipInfo (object): self.filename = filename # Normalized file name self.date_time = date_time # year, month, day, hour, min, sec + + if date_time[0] < 1980: + raise ValueError('ZIP does not support timestamps before 1980') + # Standard values: self.compress_type = ZIP_STORED # Type of compression for the file self.comment = b"" # Comment for each file @@ -314,7 +335,7 @@ class ZipInfo (object): # compress_size Size of the compressed file # file_size Size of the uncompressed file - def FileHeader(self): + def FileHeader(self, zip64=None): """Return the per-file header as a string.""" dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] @@ -329,12 +350,17 @@ class ZipInfo (object): extra = self.extra - if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: - # File is larger than what fits into a 4 byte integer, - # fall back to the ZIP64 extension + if zip64 is None: + zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT + if zip64: fmt = '<HHQQ' extra = extra + struct.pack(fmt, 1, struct.calcsize(fmt)-4, file_size, compress_size) + if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: + if not zip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") + # File is larger than what fits into a 4 byte integer, + # fall back to the ZIP64 extension file_size = 0xffffffff compress_size = 0xffffffff self.extract_version = max(45, self.extract_version) @@ -449,133 +475,161 @@ class _ZipDecrypter: self._UpdateKeys(c) return c -class ZipExtFile: + +compressor_names = { + 0: 'store', + 1: 'shrink', + 2: 'reduce', + 3: 'reduce', + 4: 'reduce', + 5: 'reduce', + 6: 'implode', + 7: 'tokenize', + 8: 'deflate', + 9: 'deflate64', + 10: 'implode', + 12: 'bzip2', + 14: 'lzma', + 18: 'terse', + 19: 'lz77', + 97: 'wavpack', + 98: 'ppmd', +} + + +class ZipExtFile(io.BufferedIOBase): """File-like object for reading an archive member. Is returned by ZipFile.open(). """ - def __init__(self, fileobj, zipinfo, decrypt=None): - self.fileobj = fileobj - self.decrypter = decrypt - self.bytes_read = 0 - self.rawbuffer = b'' - self.readbuffer = b'' - self.linebuffer = b'' - self.eof = False - self.univ_newlines = False - self.nlSeps = (b"\n", ) - self.lastdiscard = b'' - - self.compress_type = zipinfo.compress_type - self.compress_size = zipinfo.compress_size - - self.closed = False - self.mode = "r" - self.name = zipinfo.filename + # Max size supported by decompressor. + MAX_N = 1 << 31 - 1 - # read from compressed files in 64k blocks - self.compreadsize = 64*1024 - if self.compress_type == ZIP_DEFLATED: - self.dc = zlib.decompressobj(-15) + # Read from compressed files in 4k blocks. + MIN_READ_SIZE = 4096 - if hasattr(zipinfo, 'CRC'): - self._expected_crc = zipinfo.CRC - self._running_crc = crc32(b'') & 0xffffffff - else: - self._expected_crc = None + # Search for universal newlines or line chunks. + PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') - def set_univ_newlines(self, univ_newlines): - self.univ_newlines = univ_newlines + def __init__(self, fileobj, mode, zipinfo, decrypter=None, + close_fileobj=False): + self._fileobj = fileobj + self._decrypter = decrypter + self._close_fileobj = close_fileobj - # pick line separator char(s) based on universal newlines flag - self.nlSeps = (b"\n", ) - if self.univ_newlines: - self.nlSeps = (b"\r\n", b"\r", b"\n") + self._compress_type = zipinfo.compress_type + self._compress_size = zipinfo.compress_size + self._compress_left = zipinfo.compress_size - def __iter__(self): - return self + if self._compress_type == ZIP_DEFLATED: + self._decompressor = zlib.decompressobj(-15) + elif self._compress_type != ZIP_STORED: + descr = compressor_names.get(self._compress_type) + if descr: + raise NotImplementedError("compression type %d (%s)" % (self._compress_type, descr)) + else: + raise NotImplementedError("compression type %d" % (self._compress_type,)) + self._unconsumed = b'' - def __next__(self): - nextline = self.readline() - if not nextline: - raise StopIteration() + self._readbuffer = b'' + self._offset = 0 - return nextline + self._universal = 'U' in mode + self.newlines = None - def close(self): - self.closed = True - - def _checkfornewline(self): - nl, nllen = -1, -1 - if self.linebuffer: - # ugly check for cases where half of an \r\n pair was - # read on the last pass, and the \r was discarded. In this - # case we just throw away the \n at the start of the buffer. - if (self.lastdiscard, self.linebuffer[:1]) == (b'\r', b'\n'): - self.linebuffer = self.linebuffer[1:] - - for sep in self.nlSeps: - nl = self.linebuffer.find(sep) - if nl >= 0: - nllen = len(sep) - return nl, nllen - - return nl, nllen - - def readline(self, size = -1): - """Read a line with approx. size. If size is negative, - read a whole line. - """ - if size < 0: - size = sys.maxsize - elif size == 0: - return b'' + # Adjust read size for encrypted files since the first 12 bytes + # are for the encryption/password information. + if self._decrypter is not None: + self._compress_left -= 12 - # check for a newline already in buffer - nl, nllen = self._checkfornewline() + self.mode = mode + self.name = zipinfo.filename - if nl >= 0: - # the next line was already in the buffer - nl = min(nl, size) + if hasattr(zipinfo, 'CRC'): + self._expected_crc = zipinfo.CRC + self._running_crc = crc32(b'') & 0xffffffff else: - # no line break in buffer - try to read more - size -= len(self.linebuffer) - while nl < 0 and size > 0: - buf = self.read(min(size, 100)) - if not buf: - break - self.linebuffer += buf - size -= len(buf) - - # check for a newline in buffer - nl, nllen = self._checkfornewline() - - # we either ran out of bytes in the file, or - # met the specified size limit without finding a newline, - # so return current buffer - if nl < 0: - s = self.linebuffer - self.linebuffer = b'' - return s - - buf = self.linebuffer[:nl] - self.lastdiscard = self.linebuffer[nl:nl + nllen] - self.linebuffer = self.linebuffer[nl + nllen:] - - # line is always returned with \n as newline char (except possibly - # for a final incomplete line in the file, which is handled above). - return buf + b"\n" - - def readlines(self, sizehint = -1): - """Return a list with all (following) lines. The sizehint parameter - is ignored in this implementation. + self._expected_crc = None + + def readline(self, limit=-1): + """Read and return a line from the stream. + + If limit is specified, at most limit bytes will be read. + """ + + if not self._universal and limit < 0: + # Shortcut common case - newline found in buffer. + i = self._readbuffer.find(b'\n', self._offset) + 1 + if i > 0: + line = self._readbuffer[self._offset: i] + self._offset = i + return line + + if not self._universal: + return io.BufferedIOBase.readline(self, limit) + + line = b'' + while limit < 0 or len(line) < limit: + readahead = self.peek(2) + if readahead == b'': + return line + + # + # Search for universal newlines or line chunks. + # + # The pattern returns either a line chunk or a newline, but not + # both. Combined with peek(2), we are assured that the sequence + # '\r\n' is always retrieved completely and never split into + # separate newlines - '\r', '\n' due to coincidental readaheads. + # + match = self.PATTERN.search(readahead) + newline = match.group('newline') + if newline is not None: + if self.newlines is None: + self.newlines = [] + if newline not in self.newlines: + self.newlines.append(newline) + self._offset += len(newline) + return line + b'\n' + + chunk = match.group('chunk') + if limit >= 0: + chunk = chunk[: limit - len(line)] + + self._offset += len(chunk) + line += chunk + + return line + + def peek(self, n=1): + """Returns buffered bytes without advancing the position.""" + if n > len(self._readbuffer) - self._offset: + chunk = self.read(n) + self._offset -= len(chunk) + + # Return up to 512 bytes to reduce allocation overhead for tight loops. + return self._readbuffer[self._offset: self._offset + 512] + + def readable(self): + return True + + def read(self, n=-1): + """Read and return up to n bytes. + If the argument is omitted, None, or negative, data is read and returned until EOF is reached.. """ - result = [] + buf = b'' + if n is None: + n = -1 while True: - line = self.readline() - if not line: break - result.append(line) - return result + if n < 0: + data = self.read1(n) + elif n > len(buf): + data = self.read1(n - len(buf)) + else: + return buf + if len(data) == 0: + return buf + buf += data def _update_crc(self, newdata, eof): # Update the CRC using the given data. @@ -585,86 +639,67 @@ class ZipExtFile: self._running_crc = crc32(newdata, self._running_crc) & 0xffffffff # Check the CRC if we're at the end of the file if eof and self._running_crc != self._expected_crc: - raise BadZipfile("Bad CRC-32 for file %r" % self.name) - - def read(self, size = None): - # act like file obj and return empty string if size is 0 - if size == 0: - return b'' - - # determine read size - bytesToRead = self.compress_size - self.bytes_read - - # adjust read size for encrypted files since the first 12 bytes - # are for the encryption/password information - if self.decrypter is not None: - bytesToRead -= 12 - - if size is not None and size >= 0: - if self.compress_type == ZIP_STORED: - lr = len(self.readbuffer) - bytesToRead = min(bytesToRead, size - lr) - elif self.compress_type == ZIP_DEFLATED: - if len(self.readbuffer) > size: - # the user has requested fewer bytes than we've already - # pulled through the decompressor; don't read any more - bytesToRead = 0 - else: - # user will use up the buffer, so read some more - lr = len(self.rawbuffer) - bytesToRead = min(bytesToRead, self.compreadsize - lr) - - # avoid reading past end of file contents - if bytesToRead + self.bytes_read > self.compress_size: - bytesToRead = self.compress_size - self.bytes_read - - # try to read from file (if necessary) - if bytesToRead > 0: - data = self.fileobj.read(bytesToRead) - self.bytes_read += len(data) - try: - self.rawbuffer += data - except: - print(repr(self.fileobj), repr(self.rawbuffer), - repr(data)) - raise - - # handle contents of raw buffer - if self.rawbuffer: - newdata = self.rawbuffer - self.rawbuffer = b'' - - # decrypt new data if we were given an object to handle that - if newdata and self.decrypter is not None: - newdata = bytes(map(self.decrypter, newdata)) - - # decompress newly read data if necessary - if newdata and self.compress_type == ZIP_DEFLATED: - newdata = self.dc.decompress(newdata) - self.rawbuffer = self.dc.unconsumed_tail - if self.eof and len(self.rawbuffer) == 0: - # we're out of raw bytes (both from the file and - # the local buffer); flush just to make sure the - # decompressor is done - newdata += self.dc.flush() - # prevent decompressor from being used again - self.dc = None - - self._update_crc(newdata, eof=( - self.compress_size == self.bytes_read and - len(self.rawbuffer) == 0)) - self.readbuffer += newdata - - # return what the user asked for - if size is None or len(self.readbuffer) <= size: - data = self.readbuffer - self.readbuffer = b'' - else: - data = self.readbuffer[:size] - self.readbuffer = self.readbuffer[size:] + raise BadZipFile("Bad CRC-32 for file %r" % self.name) + + def read1(self, n): + """Read up to n bytes with at most one read() system call.""" + + # Simplify algorithm (branching) by transforming negative n to large n. + if n < 0 or n is None: + n = self.MAX_N + # Bytes available in read buffer. + len_readbuffer = len(self._readbuffer) - self._offset + + # Read from file. + if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed): + nbytes = n - len_readbuffer - len(self._unconsumed) + nbytes = max(nbytes, self.MIN_READ_SIZE) + nbytes = min(nbytes, self._compress_left) + + data = self._fileobj.read(nbytes) + self._compress_left -= len(data) + + if data and self._decrypter is not None: + data = bytes(map(self._decrypter, data)) + + if self._compress_type == ZIP_STORED: + self._update_crc(data, eof=(self._compress_left==0)) + self._readbuffer = self._readbuffer[self._offset:] + data + self._offset = 0 + else: + # Prepare deflated bytes for decompression. + self._unconsumed += data + + # Handle unconsumed data. + if (len(self._unconsumed) > 0 and n > len_readbuffer and + self._compress_type == ZIP_DEFLATED): + data = self._decompressor.decompress( + self._unconsumed, + max(n - len_readbuffer, self.MIN_READ_SIZE) + ) + + self._unconsumed = self._decompressor.unconsumed_tail + eof = len(self._unconsumed) == 0 and self._compress_left == 0 + if eof: + data += self._decompressor.flush() + + self._update_crc(data, eof=eof) + self._readbuffer = self._readbuffer[self._offset:] + data + self._offset = 0 + + # Read from buffer. + data = self._readbuffer[self._offset: self._offset + n] + self._offset += len(data) return data + def close(self): + try: + if self._close_fileobj: + self._fileobj.close() + finally: + super().close() + class ZipFile: """ Class with methods to open, read, write, close, list zip files. @@ -705,7 +740,7 @@ class ZipFile: self.compression = compression # Method of compression self.mode = key = mode.replace('b', '')[0] self.pwd = None - self.comment = b'' + self._comment = b'' # Check if we were passed a file-like object if isinstance(file, str): @@ -726,56 +761,55 @@ class ZipFile: self.fp = file self.filename = getattr(file, 'name', None) - if key == 'r': - self._GetContents() - elif key == 'w': - # set the modified flag so central directory gets written - # even if no files are added to the archive - self._didModify = True - elif key == 'a': - try: - # See if file is a zip file + try: + if key == 'r': self._RealGetContents() - # seek to start of directory and overwrite - self.fp.seek(self.start_dir, 0) - except BadZipfile: - # file is not a zip file, just append - self.fp.seek(0, 2) - + elif key == 'w': # set the modified flag so central directory gets written # even if no files are added to the archive self._didModify = True - else: - if not self._filePassed: - self.fp.close() - self.fp = None - raise RuntimeError('Mode must be "r", "w" or "a"') - - def _GetContents(self): - """Read the directory, making sure we close the file if the format - is bad.""" - try: - self._RealGetContents() - except BadZipfile: + elif key == 'a': + try: + # See if file is a zip file + self._RealGetContents() + # seek to start of directory and overwrite + self.fp.seek(self.start_dir, 0) + except BadZipFile: + # file is not a zip file, just append + self.fp.seek(0, 2) + + # set the modified flag so central directory gets written + # even if no files are added to the archive + self._didModify = True + else: + raise RuntimeError('Mode must be "r", "w" or "a"') + except: + fp = self.fp + self.fp = None if not self._filePassed: - self.fp.close() - self.fp = None + fp.close() raise + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + def _RealGetContents(self): """Read in the table of contents for the ZIP file.""" fp = self.fp try: endrec = _EndRecData(fp) except IOError: - raise BadZipfile("File is not a zip file") + raise BadZipFile("File is not a zip file") if not endrec: - raise BadZipfile("File is not a zip file") + raise BadZipFile("File is not a zip file") if self.debug > 1: print(endrec) size_cd = endrec[_ECD_SIZE] # bytes in central directory offset_cd = endrec[_ECD_OFFSET] # offset of central directory - self.comment = endrec[_ECD_COMMENT] # archive comment + self._comment = endrec[_ECD_COMMENT] # archive comment # "concat" is zero, unless zip was concatenated to another file concat = endrec[_ECD_LOCATION] - size_cd - offset_cd @@ -794,9 +828,11 @@ class ZipFile: total = 0 while total < size_cd: centdir = fp.read(sizeCentralDir) - if centdir[0:4] != stringCentralDir: - raise BadZipfile("Bad magic number for central directory") + if len(centdir) != sizeCentralDir: + raise BadZipFile("Truncated central directory") centdir = struct.unpack(structCentralDir, centdir) + if centdir[_CD_SIGNATURE] != stringCentralDir: + raise BadZipFile("Bad magic number for central directory") if self.debug > 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) @@ -863,10 +899,10 @@ class ZipFile: try: # Read by chunks, to avoid an OverflowError or a # MemoryError with very large embedded files. - f = self.open(zinfo.filename, "r") - while f.read(chunk_size): # Check CRC-32 - pass - except BadZipfile: + with self.open(zinfo.filename, "r") as f: + while f.read(chunk_size): # Check CRC-32 + pass + except BadZipFile: return zinfo.filename def getinfo(self, name): @@ -887,9 +923,28 @@ class ZipFile: else: self.pwd = None + @property + def comment(self): + """The comment text associated with the ZIP file.""" + return self._comment + + @comment.setter + def comment(self, comment): + if not isinstance(comment, bytes): + raise TypeError("comment: expected bytes, got %s" % type(comment)) + # check for valid comment length + if len(comment) >= ZIP_MAX_COMMENT: + if self.debug: + print('Archive comment is too long; truncating to %d bytes' + % ZIP_MAX_COMMENT) + comment = comment[:ZIP_MAX_COMMENT] + self._comment = comment + self._didModify = True + def read(self, name, pwd=None): """Return file bytes (as a string) for name.""" - return self.open(name, "r", pwd).read() + with self.open(name, "r", pwd) as fp: + return fp.read() def open(self, name, mode="r", pwd=None): """Return file-like object for 'name'.""" @@ -908,74 +963,72 @@ class ZipFile: else: zef_file = io.open(self.filename, 'rb') - # Make sure we have an info object - if isinstance(name, ZipInfo): - # 'name' is already an info object - zinfo = name - else: - # Get info object for name - zinfo = self.getinfo(name) - - zef_file.seek(zinfo.header_offset, 0) - - # Skip the file header: - fheader = zef_file.read(sizeFileHeader) - if fheader[0:4] != stringFileHeader: - raise BadZipfile("Bad magic number for file header") - - fheader = struct.unpack(structFileHeader, fheader) - fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) - if fheader[_FH_EXTRA_FIELD_LENGTH]: - zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) - - if zinfo.flag_bits & 0x800: - # UTF-8 filename - fname_str = fname.decode("utf-8") - else: - fname_str = fname.decode("cp437") - - if fname_str != zinfo.orig_filename: - raise BadZipfile( - 'File name in directory %r and header %r differ.' - % (zinfo.orig_filename, fname)) - - # check for encrypted flag & handle password - is_encrypted = zinfo.flag_bits & 0x1 - zd = None - if is_encrypted: - if not pwd: - pwd = self.pwd - if not pwd: - raise RuntimeError("File %s is encrypted, " - "password required for extraction" % name) - - zd = _ZipDecrypter(pwd) - # The first 12 bytes in the cypher stream is an encryption header - # used to strengthen the algorithm. The first 11 bytes are - # completely random, while the 12th contains the MSB of the CRC, - # or the MSB of the file time depending on the header type - # and is used to check the correctness of the password. - header = zef_file.read(12) - h = list(map(zd, header[0:12])) - if zinfo.flag_bits & 0x8: - # compare against the file type from extended local headers - check_byte = (zinfo._raw_time >> 8) & 0xff + try: + # Make sure we have an info object + if isinstance(name, ZipInfo): + # 'name' is already an info object + zinfo = name else: - # compare against the CRC otherwise - check_byte = (zinfo.CRC >> 24) & 0xff - if h[11] != check_byte: - raise RuntimeError("Bad password for file", name) - - # build and return a ZipExtFile - if zd is None: - zef = ZipExtFile(zef_file, zinfo) - else: - zef = ZipExtFile(zef_file, zinfo, zd) - - # set universal newlines on ZipExtFile if necessary - if "U" in mode: - zef.set_univ_newlines(True) - return zef + # Get info object for name + zinfo = self.getinfo(name) + zef_file.seek(zinfo.header_offset, 0) + + # Skip the file header: + fheader = zef_file.read(sizeFileHeader) + if len(fheader) != sizeFileHeader: + raise BadZipFile("Truncated file header") + fheader = struct.unpack(structFileHeader, fheader) + if fheader[_FH_SIGNATURE] != stringFileHeader: + raise BadZipFile("Bad magic number for file header") + + fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) + if fheader[_FH_EXTRA_FIELD_LENGTH]: + zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + + if zinfo.flag_bits & 0x800: + # UTF-8 filename + fname_str = fname.decode("utf-8") + else: + fname_str = fname.decode("cp437") + + if fname_str != zinfo.orig_filename: + raise BadZipFile( + 'File name in directory %r and header %r differ.' + % (zinfo.orig_filename, fname)) + + # check for encrypted flag & handle password + is_encrypted = zinfo.flag_bits & 0x1 + zd = None + if is_encrypted: + if not pwd: + pwd = self.pwd + if not pwd: + raise RuntimeError("File %s is encrypted, password " + "required for extraction" % name) + + zd = _ZipDecrypter(pwd) + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. + header = zef_file.read(12) + h = list(map(zd, header[0:12])) + if zinfo.flag_bits & 0x8: + # compare against the file type from extended local headers + check_byte = (zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (zinfo.CRC >> 24) & 0xff + if h[11] != check_byte: + raise RuntimeError("Bad password for file", name) + + return ZipExtFile(zef_file, mode, zinfo, zd, + close_fileobj=not self._filePassed) + except: + if not self._filePassed: + zef_file.close() + raise def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, @@ -1009,17 +1062,25 @@ class ZipFile: """ # build the destination pathname, replacing # forward slashes to platform specific separators. - # Strip trailing path separator, unless it represents the root. - if (targetpath[-1:] in (os.path.sep, os.path.altsep) - and len(os.path.splitdrive(targetpath)[1]) > 1): - targetpath = targetpath[:-1] - - # don't include leading "/" from file name if present - if member.filename[0] == '/': - targetpath = os.path.join(targetpath, member.filename[1:]) - else: - targetpath = os.path.join(targetpath, member.filename) - + arcname = member.filename.replace('/', os.path.sep) + + if os.path.altsep: + arcname = arcname.replace(os.path.altsep, os.path.sep) + # interpret absolute pathname as relative, remove drive letter or + # UNC path, redundant separators, "." and ".." components. + arcname = os.path.splitdrive(arcname)[1] + arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) + if x not in ('', os.path.curdir, os.path.pardir)) + if os.path.sep == '\\': + # filter illegal characters on Windows + illegal = ':<>|"?*' + table = str.maketrans(illegal, '_' * len(illegal)) + arcname = arcname.translate(table) + # remove trailing dots + arcname = (x.rstrip('.') for x in arcname.split(os.path.sep)) + arcname = os.path.sep.join(x for x in arcname if x) + + targetpath = os.path.join(targetpath, arcname) targetpath = os.path.normpath(targetpath) # Create all upper directories if necessary. @@ -1032,11 +1093,9 @@ class ZipFile: os.mkdir(targetpath) return targetpath - source = self.open(member, pwd=pwd) - target = open(targetpath, "wb") - shutil.copyfileobj(source, target) - source.close() - target.close() + with self.open(member, pwd=pwd) as source, \ + open(targetpath, "wb") as target: + shutil.copyfileobj(source, target) return targetpath @@ -1102,20 +1161,23 @@ class ZipFile: zinfo.CRC = 0 self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo - self.fp.write(zinfo.FileHeader()) + self.fp.write(zinfo.FileHeader(False)) return with open(filename, "rb") as fp: # Must overwrite CRC and sizes with correct data later zinfo.CRC = CRC = 0 zinfo.compress_size = compress_size = 0 - zinfo.file_size = file_size = 0 - self.fp.write(zinfo.FileHeader()) + # Compressed size can be larger than uncompressed size + zip64 = self._allowZip64 and \ + zinfo.file_size * 1.05 > ZIP64_LIMIT + self.fp.write(zinfo.FileHeader(zip64)) if zinfo.compress_type == ZIP_DEFLATED: cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15) else: cmpr = None + file_size = 0 while 1: buf = fp.read(1024 * 8) if not buf: @@ -1135,16 +1197,21 @@ class ZipFile: zinfo.compress_size = file_size zinfo.CRC = CRC zinfo.file_size = file_size - # Seek backwards and write CRC and file sizes + if not zip64 and self._allowZip64: + if file_size > ZIP64_LIMIT: + raise RuntimeError('File size has increased during compressing') + if compress_size > ZIP64_LIMIT: + raise RuntimeError('Compressed size larger than uncompressed size') + # Seek backwards and write file header (which will now include + # correct CRC and file sizes) position = self.fp.tell() # Preserve current position in file - self.fp.seek(zinfo.header_offset + 14, 0) - self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, - zinfo.file_size)) + self.fp.seek(zinfo.header_offset, 0) + self.fp.write(zinfo.FileHeader(zip64)) self.fp.seek(position, 0) self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo - def writestr(self, zinfo_or_arcname, data): + def writestr(self, zinfo_or_arcname, data, compress_type=None): """Write a file into the archive. The contents is 'data', which may be either a 'str' or a 'bytes' instance; if it is a 'str', it is encoded as UTF-8 first. @@ -1166,6 +1233,9 @@ class ZipFile: zinfo.file_size = len(data) # Uncompressed size zinfo.header_offset = self.fp.tell() # Start of header data + if compress_type is not None: + zinfo.compress_type = compress_type + self._writecheck(zinfo) self._didModify = True zinfo.CRC = crc32(data) & 0xffffffff # CRC-32 checksum @@ -1176,14 +1246,18 @@ class ZipFile: zinfo.compress_size = len(data) # Compressed size else: zinfo.compress_size = zinfo.file_size - zinfo.header_offset = self.fp.tell() # Start of header data - self.fp.write(zinfo.FileHeader()) + zip64 = zinfo.file_size > ZIP64_LIMIT or \ + zinfo.compress_size > ZIP64_LIMIT + if zip64 and not self._allowZip64: + raise LargeZipFile("Filesize would require ZIP64 extensions") + self.fp.write(zinfo.FileHeader(zip64)) self.fp.write(data) - self.fp.flush() if zinfo.flag_bits & 0x08: # Write CRC and file sizes after the file data - self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, + fmt = '<LQQ' if zip64 else '<LLL' + self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, zinfo.file_size)) + self.fp.flush() self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo @@ -1197,113 +1271,114 @@ class ZipFile: if self.fp is None: return - if self.mode in ("w", "a") and self._didModify: # write ending records - count = 0 - pos1 = self.fp.tell() - for zinfo in self.filelist: # write central directory - count = count + 1 - dt = zinfo.date_time - dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] - dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - extra = [] - if zinfo.file_size > ZIP64_LIMIT \ - or zinfo.compress_size > ZIP64_LIMIT: - extra.append(zinfo.file_size) - extra.append(zinfo.compress_size) - file_size = 0xffffffff - compress_size = 0xffffffff - else: - file_size = zinfo.file_size - compress_size = zinfo.compress_size - - if zinfo.header_offset > ZIP64_LIMIT: - extra.append(zinfo.header_offset) - header_offset = 0xffffffff - else: - header_offset = zinfo.header_offset - - extra_data = zinfo.extra - if extra: - # Append a ZIP64 field to the extra's - extra_data = struct.pack( - '<HH' + 'Q'*len(extra), - 1, 8*len(extra), *extra) + extra_data - - extract_version = max(45, zinfo.extract_version) - create_version = max(45, zinfo.create_version) - else: - extract_version = zinfo.extract_version - create_version = zinfo.create_version - - try: - filename, flag_bits = zinfo._encodeFilenameFlags() - centdir = struct.pack(structCentralDir, - stringCentralDir, create_version, - zinfo.create_system, extract_version, zinfo.reserved, - flag_bits, zinfo.compress_type, dostime, dosdate, - zinfo.CRC, compress_size, file_size, - len(filename), len(extra_data), len(zinfo.comment), - 0, zinfo.internal_attr, zinfo.external_attr, - header_offset) - except DeprecationWarning: - print((structCentralDir, stringCentralDir, create_version, - zinfo.create_system, extract_version, zinfo.reserved, - zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, - zinfo.CRC, compress_size, file_size, - len(zinfo.filename), len(extra_data), len(zinfo.comment), - 0, zinfo.internal_attr, zinfo.external_attr, - header_offset), file=sys.stderr) - raise - self.fp.write(centdir) - self.fp.write(filename) - self.fp.write(extra_data) - self.fp.write(zinfo.comment) - - pos2 = self.fp.tell() - # Write end-of-zip-archive record - centDirCount = count - centDirSize = pos2 - pos1 - centDirOffset = pos1 - if (centDirCount >= ZIP_FILECOUNT_LIMIT or - centDirOffset > ZIP64_LIMIT or - centDirSize > ZIP64_LIMIT): - # Need to write the ZIP64 end-of-archive records - zip64endrec = struct.pack( - structEndArchive64, stringEndArchive64, - 44, 45, 45, 0, 0, centDirCount, centDirCount, - centDirSize, centDirOffset) - self.fp.write(zip64endrec) - - zip64locrec = struct.pack( - structEndArchive64Locator, - stringEndArchive64Locator, 0, pos2, 1) - self.fp.write(zip64locrec) - centDirCount = min(centDirCount, 0xFFFF) - centDirSize = min(centDirSize, 0xFFFFFFFF) - centDirOffset = min(centDirOffset, 0xFFFFFFFF) - - # check for valid comment length - if len(self.comment) >= ZIP_MAX_COMMENT: - if self.debug > 0: - msg = 'Archive comment is too long; truncating to %d bytes' \ - % ZIP_MAX_COMMENT - self.comment = self.comment[:ZIP_MAX_COMMENT] - - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, centDirCount, centDirCount, - centDirSize, centDirOffset, len(self.comment)) - self.fp.write(endrec) - self.fp.write(self.comment) - self.fp.flush() - - if not self._filePassed: - self.fp.close() - self.fp = None + try: + if self.mode in ("w", "a") and self._didModify: # write ending records + count = 0 + pos1 = self.fp.tell() + for zinfo in self.filelist: # write central directory + count = count + 1 + dt = zinfo.date_time + dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] + dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) + extra = [] + if zinfo.file_size > ZIP64_LIMIT \ + or zinfo.compress_size > ZIP64_LIMIT: + extra.append(zinfo.file_size) + extra.append(zinfo.compress_size) + file_size = 0xffffffff + compress_size = 0xffffffff + else: + file_size = zinfo.file_size + compress_size = zinfo.compress_size + + if zinfo.header_offset > ZIP64_LIMIT: + extra.append(zinfo.header_offset) + header_offset = 0xffffffff + else: + header_offset = zinfo.header_offset + + extra_data = zinfo.extra + if extra: + # Append a ZIP64 field to the extra's + extra_data = struct.pack( + '<HH' + 'Q'*len(extra), + 1, 8*len(extra), *extra) + extra_data + + extract_version = max(45, zinfo.extract_version) + create_version = max(45, zinfo.create_version) + else: + extract_version = zinfo.extract_version + create_version = zinfo.create_version + + try: + filename, flag_bits = zinfo._encodeFilenameFlags() + centdir = struct.pack(structCentralDir, + stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset) + except DeprecationWarning: + print((structCentralDir, stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(zinfo.filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset), file=sys.stderr) + raise + self.fp.write(centdir) + self.fp.write(filename) + self.fp.write(extra_data) + self.fp.write(zinfo.comment) + + pos2 = self.fp.tell() + # Write end-of-zip-archive record + centDirCount = count + centDirSize = pos2 - pos1 + centDirOffset = pos1 + if (centDirCount >= ZIP_FILECOUNT_LIMIT or + centDirOffset > ZIP64_LIMIT or + centDirSize > ZIP64_LIMIT): + # Need to write the ZIP64 end-of-archive records + zip64endrec = struct.pack( + structEndArchive64, stringEndArchive64, + 44, 45, 45, 0, 0, centDirCount, centDirCount, + centDirSize, centDirOffset) + self.fp.write(zip64endrec) + + zip64locrec = struct.pack( + structEndArchive64Locator, + stringEndArchive64Locator, 0, pos2, 1) + self.fp.write(zip64locrec) + centDirCount = min(centDirCount, 0xFFFF) + centDirSize = min(centDirSize, 0xFFFFFFFF) + centDirOffset = min(centDirOffset, 0xFFFFFFFF) + + endrec = struct.pack(structEndArchive, stringEndArchive, + 0, 0, centDirCount, centDirCount, + centDirSize, centDirOffset, len(self._comment)) + self.fp.write(endrec) + self.fp.write(self._comment) + self.fp.flush() + finally: + fp = self.fp + self.fp = None + if not self._filePassed: + fp.close() class PyZipFile(ZipFile): """Class to create ZIP archives with Python library files and packages.""" + def __init__(self, file, mode="r", compression=ZIP_STORED, + allowZip64=False, optimize=-1): + ZipFile.__init__(self, file, mode=mode, compression=compression, + allowZip64=allowZip64) + self._optimize = optimize + def writepy(self, pathname, basename=""): """Add all files from "pathname" to the ZIP archive. @@ -1376,25 +1451,64 @@ class PyZipFile(ZipFile): archive name, compiling if necessary. For example, given /python/lib/string, return (/python/lib/string.pyc, string). """ - file_py = pathname + ".py" - file_pyc = pathname + ".pyc" - file_pyo = pathname + ".pyo" - if os.path.isfile(file_pyo) and \ - os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime: - fname = file_pyo # Use .pyo file - elif not os.path.isfile(file_pyc) or \ - os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime: + def _compile(file, optimize=-1): import py_compile if self.debug: - print("Compiling", file_py) + print("Compiling", file) try: - py_compile.compile(file_py, file_pyc, None, True) + py_compile.compile(file, doraise=True, optimize=optimize) except py_compile.PyCompileError as err: print(err.msg) - fname = file_pyc + return False + return True + + file_py = pathname + ".py" + file_pyc = pathname + ".pyc" + file_pyo = pathname + ".pyo" + pycache_pyc = imp.cache_from_source(file_py, True) + pycache_pyo = imp.cache_from_source(file_py, False) + if self._optimize == -1: + # legacy mode: use whatever file is present + if (os.path.isfile(file_pyo) and + os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime): + # Use .pyo file. + arcname = fname = file_pyo + elif (os.path.isfile(file_pyc) and + os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime): + # Use .pyc file. + arcname = fname = file_pyc + elif (os.path.isfile(pycache_pyc) and + os.stat(pycache_pyc).st_mtime >= os.stat(file_py).st_mtime): + # Use the __pycache__/*.pyc file, but write it to the legacy pyc + # file name in the archive. + fname = pycache_pyc + arcname = file_pyc + elif (os.path.isfile(pycache_pyo) and + os.stat(pycache_pyo).st_mtime >= os.stat(file_py).st_mtime): + # Use the __pycache__/*.pyo file, but write it to the legacy pyo + # file name in the archive. + fname = pycache_pyo + arcname = file_pyo + else: + # Compile py into PEP 3147 pyc file. + if _compile(file_py): + fname = (pycache_pyc if __debug__ else pycache_pyo) + arcname = (file_pyc if __debug__ else file_pyo) + else: + fname = arcname = file_py else: - fname = file_pyc - archivename = os.path.split(fname)[1] + # new mode: use given optimization level + if self._optimize == 0: + fname = pycache_pyc + arcname = file_pyc + else: + fname = pycache_pyo + arcname = file_pyo + if not (os.path.isfile(fname) and + os.stat(fname).st_mtime >= os.stat(file_py).st_mtime): + if not _compile(file_py, optimize=self._optimize): + fname = arcname = file_py + archivename = os.path.split(arcname)[1] if basename: archivename = "%s/%s" % (basename, archivename) return (fname, archivename) @@ -1420,16 +1534,15 @@ def main(args = None): if len(args) != 2: print(USAGE) sys.exit(1) - zf = ZipFile(args[1], 'r') - zf.printdir() - zf.close() + with ZipFile(args[1], 'r') as zf: + zf.printdir() elif args[0] == '-t': if len(args) != 2: print(USAGE) sys.exit(1) - zf = ZipFile(args[1], 'r') - badfile = zf.testzip() + with ZipFile(args[1], 'r') as zf: + badfile = zf.testzip() if badfile: print("The following enclosed file is corrupted: {!r}".format(badfile)) print("Done testing") @@ -1439,20 +1552,19 @@ def main(args = None): print(USAGE) sys.exit(1) - zf = ZipFile(args[1], 'r') - out = args[2] - for path in zf.namelist(): - if path.startswith('./'): - tgt = os.path.join(out, path[2:]) - else: - tgt = os.path.join(out, path) + with ZipFile(args[1], 'r') as zf: + out = args[2] + for path in zf.namelist(): + if path.startswith('./'): + tgt = os.path.join(out, path[2:]) + else: + tgt = os.path.join(out, path) - tgtdir = os.path.dirname(tgt) - if not os.path.exists(tgtdir): - os.makedirs(tgtdir) - with open(tgt, 'wb') as fp: - fp.write(zf.read(path)) - zf.close() + tgtdir = os.path.dirname(tgt) + if not os.path.exists(tgtdir): + os.makedirs(tgtdir) + with open(tgt, 'wb') as fp: + fp.write(zf.read(path)) elif args[0] == '-c': if len(args) < 3: @@ -1468,11 +1580,9 @@ def main(args = None): os.path.join(path, nm), os.path.join(zippath, nm)) # else: ignore - zf = ZipFile(args[1], 'w', allowZip64=True) - for src in args[2:]: - addToZip(zf, src, os.path.basename(src)) - - zf.close() + with ZipFile(args[1], 'w', allowZip64=True) as zf: + for src in args[2:]: + addToZip(zf, src, os.path.basename(src)) if __name__ == "__main__": main() |