diff options
Diffstat (limited to 'Lib/gzip.py')
-rw-r--r-- | Lib/gzip.py | 273 |
1 files changed, 185 insertions, 88 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py index 8a2a7184df..6aacc9a4f9 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -5,11 +5,12 @@ but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module -import struct, sys, time +import struct, sys, time, os import zlib import builtins +import io -__all__ = ["GzipFile","open"] +__all__ = ["GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 @@ -44,10 +45,69 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) -class GzipFile: +class _PaddedFile: + """Minimal read-only file object that prepends a string to the contents + of an actual file. Shouldn't be used outside of gzip.py, as it lacks + essential functionality.""" + + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + + def seek(self, offset, whence=0): + # This is only ever called with offset=whence=0 + if whence == 1 and self._read is not None: + if 0 <= offset + self._read <= self._length: + self._read += offset + return + else: + offset += self._length - self._read + self._read = None + self._buffer = None + return self.file.seek(offset, whence) 
+ + def __getattr__(self, name): + return getattr(self.file, name) + + +class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. + This class only supports opening files in binary mode. If you need to open a + compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper. + """ myfileobj = None @@ -74,12 +134,13 @@ class GzipFile: The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb', depending on whether the file will be read or written. The default is the mode of fileobj if discernible; otherwise, the default is 'rb'. - Be aware that only the 'rb', 'ab', and 'wb' values should be used - for cross-platform portability. + A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and + 'wb', and 'a' and 'ab'. - The compresslevel argument is an integer from 1 to 9 controlling the + The compresslevel argument is an integer from 0 to 9 controlling the level of compression; 1 is fastest and produces the least compression, - and 9 is slowest and produces the most compression. The default is 9. + and 9 is slowest and produces the most compression. 0 is no compression + at all. The default is 9. The mtime argument is an optional numeric timestamp to be written to the stream when compressing. 
All gzip compressed streams @@ -92,15 +153,16 @@ class GzipFile: """ - # guarantee the file is opened in binary mode on platforms - # that care about that sort of thing + if mode and ('t' in mode or 'U' in mode): + raise IOError("Mode " + mode + " not supported") if mode and 'b' not in mode: mode += 'b' if fileobj is None: fileobj = self.myfileobj = builtins.open(filename, mode or 'rb') if filename is None: - if hasattr(fileobj, 'name'): filename = fileobj.name - else: filename = '' + filename = getattr(fileobj, 'name', '') + if not isinstance(filename, (str, bytes)): + filename = '' if mode is None: if hasattr(fileobj, 'mode'): mode = fileobj.mode else: mode = 'rb' @@ -109,11 +171,16 @@ class GzipFile: self.mode = READ # Set flag indicating start of a new member self._new_member = True + # Buffer data read from gzip file. extrastart is offset in + # stream where buffer starts. extrasize is number of + # bytes remaining in buffer from current stream position. self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.name = filename # Starts small, scales exponentially self.min_readsize = 100 + fileobj = _PaddedFile(fileobj) elif mode[0:1] == 'w' or mode[0:1] == 'a': self.mode = WRITE @@ -129,7 +196,6 @@ class GzipFile: self.fileobj = fileobj self.offset = 0 self.mtime = mtime - self.closed = False if self.mode == WRITE: self._write_gzip_header() @@ -143,7 +209,10 @@ class GzipFile: return self.name def __repr__(self): - s = repr(self.fileobj) + fileobj = self.fileobj + if isinstance(fileobj, _PaddedFile): + fileobj = fileobj.file + s = repr(fileobj) return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>' def _check_closed(self): @@ -166,7 +235,9 @@ class GzipFile: try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. 
- fname = self.name.encode('latin-1') + fname = os.path.basename(self.name) + if not isinstance(fname, bytes): + fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: @@ -190,6 +261,9 @@ class GzipFile: def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -221,6 +295,10 @@ class GzipFile: if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) def write(self,data): self._check_closed() @@ -230,12 +308,19 @@ class GzipFile: if self.fileobj is None: raise ValueError("write() on closed GzipFile object") + + # Convert data type if called by io.BufferedWriter. + if isinstance(data, memoryview): + data = data.tobytes() + if len(data) > 0: self.size = self.size + len(data) self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.fileobj.write( self.compress.compress(data) ) self.offset += len(data) + return len(data) + def read(self, size=-1): self._check_closed() if self.mode != READ: @@ -262,15 +347,38 @@ class GzipFile: if size > self.extrasize: size = self.extrasize - chunk = self.extrabuf[:size] - self.extrabuf = self.extrabuf[size:] + offset = self.offset - self.extrastart + chunk = self.extrabuf[offset: offset + size] self.extrasize = self.extrasize - size self.offset += size return chunk + def peek(self, n): + if self.mode != READ: + import errno + raise IOError(errno.EBADF, "peek() on write-only GzipFile object") + + # Do not return ridiculously small buffers, for one common idiom + # is to call peek(1) and expect more bytes in return. + if n < 100: + n = 100 + if self.extrasize == 0: + if self.fileobj is None: + return b'' + try: + # Ensure that we don't return b"" if we haven't reached EOF. 
+ while self.extrasize == 0: + # 1024 is the same buffering heuristic used in read() + self._read(max(n, 1024)) + except EOFError: + pass + offset = self.offset - self.extrastart + remaining = self.extrasize + assert remaining == len(self.extrabuf) - offset + return self.extrabuf[offset:offset + n] + def _unread(self, buf): - self.extrabuf = buf + self.extrabuf self.extrasize = len(buf) + self.extrasize self.offset -= len(buf) @@ -281,16 +389,6 @@ class GzipFile: if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -304,6 +402,9 @@ class GzipFile: if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() + self.fileobj.prepend(self.decompress.unused_data, True) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -315,10 +416,9 @@ class GzipFile: # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. 
- # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj.prepend(self.decompress.unused_data, True) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -326,17 +426,17 @@ class GzipFile: def _add_read_data(self, data): self.crc = zlib.crc32(data, self.crc) & 0xffffffff - self.extrabuf = self.extrabuf + data + offset = self.offset - self.extrastart + self.extrabuf = self.extrabuf[offset:] + data self.extrasize = self.extrasize + len(data) + self.extrastart = self.offset self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -345,6 +445,19 @@ class GzipFile: elif isize != (self.size & 0xffffffff): raise IOError("Incorrect length of data produced") + # Gzip files can be padded with zeroes and still have archives. + # Consume all zero bytes and set the file position to the first + # non-zero byte. 
See http://www.gzip.org/#faq8 + c = b"\x00" + while c == b"\x00": + c = self.fileobj.read(1) + if c: + self.fileobj.prepend(c, True) + + @property + def closed(self): + return self.fileobj is None + def close(self): if self.fileobj is None: return @@ -359,16 +472,6 @@ class GzipFile: if self.myfileobj: self.myfileobj.close() self.myfileobj = None - self.closed = True - - def __del__(self): - try: - if (self.myfileobj is None and - self.fileobj is None): - return - except AttributeError: - return - self.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_closed() @@ -385,13 +488,6 @@ class GzipFile: """ return self.fileobj.fileno() - def isatty(self): - return False - - def tell(self): - self._check_closed() - return self.offset - def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' @@ -401,8 +497,18 @@ class GzipFile: self._new_member = True self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.offset = 0 + def readable(self): + return self.mode == READ + + def writable(self): + return self.mode == WRITE + + def seekable(self): + return True + def seek(self, offset, whence=0): if whence: if whence == 1: @@ -426,8 +532,18 @@ class GzipFile: self.read(1024) self.read(count % 1024) + return self.offset + def readline(self, size=-1): if size < 0: + # Shortcut common case - newline found in buffer. 
+ offset = self.offset - self.extrastart + i = self.extrabuf.find(b'\n', offset) + 1 + if i > 0: + self.extrasize -= i - offset + self.offset += i - offset + return self.extrabuf[offset: i] + size = sys.maxsize readsize = self.min_readsize else: @@ -457,41 +573,22 @@ class GzipFile: self.min_readsize = min(readsize, self.min_readsize * 2, 512) return b''.join(bufs) # Return resulting line - def readlines(self, sizehint=0): - # Negative numbers result in reading all the lines - if sizehint <= 0: - sizehint = sys.maxsize - L = [] - while sizehint > 0: - line = self.readline() - if line == b"": - break - L.append(line) - sizehint = sizehint - len(line) - - return L - - def writelines(self, L): - for line in L: - self.write(line) - - def __iter__(self): - return self - def __next__(self): - line = self.readline() - if line: - return line - else: - raise StopIteration - - def __enter__(self): - if self.fileobj is None: - raise ValueError("I/O operation on closed GzipFile object") - return self - - def __exit__(self, *args): - self.close() +def compress(data, compresslevel=9): + """Compress data in one shot and return the compressed string. + Optional argument is the compression level, in range of 0-9. + """ + buf = io.BytesIO() + with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f: + f.write(data) + return buf.getvalue() + +def decompress(data): + """Decompress a gzip compressed string in one shot. + Return the decompressed string. + """ + with GzipFile(fileobj=io.BytesIO(data)) as f: + return f.read() def _test(): @@ -527,9 +624,9 @@ def _test(): if not chunk: break g.write(chunk) - if g is not sys.stdout: + if g is not sys.stdout.buffer: g.close() - if f is not sys.stdin: + if f is not sys.stdin.buffer: f.close() if __name__ == '__main__': |