diff options
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 20 |
1 files changed, 17 insertions, 3 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 81e8cd010..ff3b95d6e 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -148,6 +148,7 @@ else:
 
 MAGIC_PREFIX = asbytes('\x93NUMPY')
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
+BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes
 
 def magic(major, minor):
     """ Return the magic string for the given file format version.
@@ -457,9 +458,22 @@ def read_array(fp):
         else:
             # This is not a real file. We have to read it the memory-intensive
             # way.
-            # XXX: we can probably chunk this to avoid the memory hit.
-            data = fp.read(int(count * dtype.itemsize))
-            array = numpy.fromstring(data, dtype=dtype, count=count)
+            # crc32 module fails on reads greater than 2 ** 32 bytes, breaking
+            # large reads from gzip streams. Chunk reads to BUFFER_SIZE bytes to
+            # avoid issue and reduce memory overhead of the read. In
+            # non-chunked case count < max_read_count, so only one read is
+            # performed.
+
+            max_read_count = BUFFER_SIZE // dtype.itemsize
+
+            array = numpy.empty(count, dtype=dtype)
+
+            for i in range(0, count, max_read_count):
+                read_count = min(max_read_count, count - i)
+
+                data = fp.read(int(read_count * dtype.itemsize))
+                array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+                                                         count=read_count)
 
         if fortran_order:
             array.shape = shape[::-1]