Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--  numpy/lib/format.py  |  43 ++++++++++++++++++++++++++++++-------------
1 file changed, 30 insertions(+), 13 deletions(-)
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 1e508f3e5..ff3b95d6e 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -134,16 +134,21 @@ The ``.npy`` format, including reasons for creating it and a comparison of
alternatives, is described fully in the "npy-format" NEP.
"""
-
-import cPickle
+from __future__ import division, absolute_import, print_function
import numpy
import sys
from numpy.lib.utils import safe_eval
-from numpy.compat import asbytes, isfileobj
+from numpy.compat import asbytes, isfileobj, long, basestring
+
+if sys.version_info[0] >= 3:
+ import pickle
+else:
+ import cPickle as pickle
MAGIC_PREFIX = asbytes('\x93NUMPY')
MAGIC_LEN = len(MAGIC_PREFIX) + 2
+BUFFER_SIZE = 2 ** 18  # size of buffer for reading npz files in bytes
def magic(major, minor):
""" Return the magic string for the given file format version.
@@ -310,7 +315,7 @@ def read_array_header_1_0(fp):
Raises
------
- ValueError :
+ ValueError
If the data is invalid.
"""
@@ -334,14 +339,13 @@ def read_array_header_1_0(fp):
# "descr" : dtype.descr
try:
d = safe_eval(header)
- except SyntaxError, e:
+ except SyntaxError as e:
msg = "Cannot parse header: %r\nException: %r"
raise ValueError(msg % (header, e))
if not isinstance(d, dict):
msg = "Header is not a dictionary: %r"
raise ValueError(msg % d)
- keys = d.keys()
- keys.sort()
+ keys = sorted(d.keys())
if keys != ['descr', 'fortran_order', 'shape']:
msg = "Header does not contain the correct keys: %r"
raise ValueError(msg % (keys,))
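The switch from keys = d.keys(); keys.sort() to sorted(d.keys()) is needed because dict.keys() returns a view object with no .sort() method on Python 3; sorted() accepts any iterable on both versions and always returns a list. A small sketch, assuming a well-formed 1.0 header dict as input:

    # On Python 2, d.keys() is a list and can be sorted in place; on
    # Python 3 it is a dict view, so sorted() is the portable spelling.
    d = {'shape': (3, 4), 'fortran_order': False, 'descr': '<f8'}
    keys = sorted(d.keys())
    assert keys == ['descr', 'fortran_order', 'shape']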
@@ -356,7 +360,7 @@ def read_array_header_1_0(fp):
raise ValueError(msg % (d['fortran_order'],))
try:
dtype = numpy.dtype(d['descr'])
- except TypeError, e:
+ except TypeError as e:
msg = "descr is not a valid dtype descriptor: %r"
raise ValueError(msg % (d['descr'],))
@@ -398,7 +402,7 @@ def write_array(fp, array, version=(1,0)):
if array.dtype.hasobject:
# We contain Python objects so we cannot write out the data directly.
# Instead, we will pickle it out with version 2 of the pickle protocol.
- cPickle.dump(array, fp, protocol=2)
+ pickle.dump(array, fp, protocol=2)
elif array.flags.f_contiguous and not array.flags.c_contiguous:
if isfileobj(fp):
array.T.tofile(fp)
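Object arrays cannot be written out as raw bytes, which is why write_array falls back to pickling; protocol=2 is the highest protocol Python 2 can read, so files written from Python 3 remain loadable there. A hedged round-trip sketch (the scratch file name is hypothetical):

    import numpy
    import pickle   # cPickle on Python 2, via the shim above

    a = numpy.array([None, {'a': 1}, 'text'], dtype=object)
    with open('objects.pkl', 'wb') as fp:    # hypothetical scratch file
        pickle.dump(a, fp, protocol=2)       # protocol 2: readable on Python 2
    with open('objects.pkl', 'rb') as fp:
        b = pickle.load(fp)
    assert list(a) == list(b)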
@@ -446,7 +450,7 @@ def read_array(fp):
# Now read the actual data.
if dtype.hasobject:
# The array contained Python objects. We need to unpickle the data.
- array = cPickle.load(fp)
+ array = pickle.load(fp)
else:
if isfileobj(fp):
# We can use the fast fromfile() function.
@@ -454,9 +458,22 @@ def read_array(fp):
else:
# This is not a real file. We have to read it the memory-intensive
# way.
- # XXX: we can probably chunk this to avoid the memory hit.
- data = fp.read(int(count * dtype.itemsize))
- array = numpy.fromstring(data, dtype=dtype, count=count)
+ # The crc32 module fails on reads greater than 2 ** 32 bytes,
+ # breaking large reads from gzip streams. Chunk reads to
+ # BUFFER_SIZE bytes to avoid the issue and reduce the memory
+ # overhead of the read. In the non-chunked case
+ # count < max_read_count, so only one read is performed.
+
+ max_read_count = BUFFER_SIZE // dtype.itemsize
+
+ array = numpy.empty(count, dtype=dtype)
+
+ for i in range(0, count, max_read_count):
+ read_count = min(max_read_count, count - i)
+
+ data = fp.read(int(read_count * dtype.itemsize))
+ array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+ count=read_count)
if fortran_order:
array.shape = shape[::-1]
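The final hunk replaces one monolithic fp.read() with a loop of bounded reads into a preallocated array, so no single read from a gzip-backed stream exceeds the 32-bit crc32 limit. A self-contained sketch of the same pattern, with _read_chunked as a hypothetical helper name and an io.BytesIO stream standing in for the non-seekable file object inside an .npz archive:

    import io
    import numpy

    BUFFER_SIZE = 2 ** 18  # bytes per read, matching the patch

    def _read_chunked(fp, count, dtype):
        # Preallocate the result, then fill it slice by slice so that no
        # single fp.read() exceeds BUFFER_SIZE bytes.
        max_read_count = BUFFER_SIZE // dtype.itemsize
        array = numpy.empty(count, dtype=dtype)
        for i in range(0, count, max_read_count):
            read_count = min(max_read_count, count - i)
            data = fp.read(int(read_count * dtype.itemsize))
            array[i:i + read_count] = numpy.frombuffer(data, dtype=dtype,
                                                       count=read_count)
        return array

    # Usage: 100000 float64 values (800000 bytes) force several chunks,
    # since BUFFER_SIZE holds only 32768 items of this dtype.
    dtype = numpy.dtype('<f8')
    stream = io.BytesIO(numpy.arange(100000, dtype=dtype).tobytes())
    out = _read_chunked(stream, count=100000, dtype=dtype)
    assert out[-1] == 99999.0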