1 files changed, 30 insertions, 13 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 1e508f3e5..ff3b95d6e 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -134,16 +134,21 @@ The ``.npy`` format, including reasons for creating it and a comparison of
 alternatives, is described fully in the "npy-format" NEP.
 
 """
-
-import cPickle
+from __future__ import division, absolute_import, print_function
 
 import numpy
 import sys
 from numpy.lib.utils import safe_eval
-from numpy.compat import asbytes, isfileobj
+from numpy.compat import asbytes, isfileobj, long, basestring
+
+if sys.version_info[0] >= 3:
+    import pickle
+else:
+    import cPickle as pickle
 
 MAGIC_PREFIX = asbytes('\x93NUMPY')
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
+BUFFER_SIZE = 2 ** 18  # buffer size, in bytes, for reading npz files
 
 def magic(major, minor):
     """ Return the magic string for the given file format version.
@@ -310,7 +315,7 @@ def read_array_header_1_0(fp):
 
     Raises
     ------
-    ValueError :
+    ValueError
         If the data is invalid.
 
     """
@@ -334,14 +339,13 @@ def read_array_header_1_0(fp):
     #   "descr" : dtype.descr
     try:
         d = safe_eval(header)
-    except SyntaxError, e:
+    except SyntaxError as e:
         msg = "Cannot parse header: %r\nException: %r"
         raise ValueError(msg % (header, e))
     if not isinstance(d, dict):
         msg = "Header is not a dictionary: %r"
         raise ValueError(msg % d)
-    keys = d.keys()
-    keys.sort()
+    keys = sorted(d.keys())
     if keys != ['descr', 'fortran_order', 'shape']:
         msg = "Header does not contain the correct keys: %r"
         raise ValueError(msg % (keys,))
@@ -356,7 +360,7 @@ def read_array_header_1_0(fp):
         raise ValueError(msg % (d['fortran_order'],))
     try:
         dtype = numpy.dtype(d['descr'])
-    except TypeError, e:
+    except TypeError as e:
         msg = "descr is not a valid dtype descriptor: %r"
         raise ValueError(msg % (d['descr'],))
 
@@ -398,7 +402,7 @@ def write_array(fp, array, version=(1,0)):
     if array.dtype.hasobject:
         # We contain Python objects so we cannot write out the data directly.
         # Instead, we will pickle it out with version 2 of the pickle protocol.
-        cPickle.dump(array, fp, protocol=2)
+        pickle.dump(array, fp, protocol=2)
     elif array.flags.f_contiguous and not array.flags.c_contiguous:
         if isfileobj(fp):
             array.T.tofile(fp)
@@ -446,7 +450,7 @@ def read_array(fp):
     # Now read the actual data.
     if dtype.hasobject:
         # The array contained Python objects. We need to unpickle the data.
-        array = cPickle.load(fp)
+        array = pickle.load(fp)
     else:
         if isfileobj(fp):
             # We can use the fast fromfile() function.
@@ -454,9 +458,22 @@ def read_array(fp):
         else:
             # This is not a real file. We have to read it the memory-intensive
             # way.
-            # XXX: we can probably chunk this to avoid the memory hit.
-            data = fp.read(int(count * dtype.itemsize))
-            array = numpy.fromstring(data, dtype=dtype, count=count)
+            # The crc32 routine fails on reads greater than 2 ** 32 bytes,
+            # breaking large reads from gzip streams. Chunk reads to
+            # BUFFER_SIZE bytes to avoid the issue and to reduce the memory
+            # overhead of the read. In the non-chunked case,
+            # count <= max_read_count, so only one read is performed.
+
+            max_read_count = BUFFER_SIZE // dtype.itemsize
+
+            array = numpy.empty(count, dtype=dtype)
+
+            for i in range(0, count, max_read_count):
+                read_count = min(max_read_count, count - i)
+
+                data = fp.read(int(read_count * dtype.itemsize))
+                array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+                                                         count=read_count)
 
         if fortran_order:
             array.shape = shape[::-1]