1 files changed, 141 insertions, 34 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 4cfbbe05d..6083312de 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -139,6 +139,7 @@ from __future__ import division, absolute_import, print_function
 import numpy
 import sys
 import io
+import warnings
 from numpy.lib.utils import safe_eval
 from numpy.compat import asbytes, isfileobj, long, basestring
 
@@ -151,6 +152,14 @@ MAGIC_PREFIX = asbytes('\x93NUMPY')
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
 BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes
 
+# difference between version 1.0 and 2.0 is a 4 byte (I) header length
+# instead of 2 bytes (H) allowing storage of large structured arrays
+
+def _check_version(version):
+    if version not in [(1, 0), (2, 0), None]:
+        msg = "we only support format version (1,0) and (2, 0), not %s"
+        raise ValueError(msg % (version,))
+
 def magic(major, minor):
     """ Return the magic string for the given file format version.
 
@@ -258,8 +267,8 @@ def header_data_from_array_1_0(array):
     d['descr'] = dtype_to_descr(array.dtype)
     return d
 
-def write_array_header_1_0(fp, d):
-    """ Write the header for an array using the 1.0 format.
+def _write_array_header(fp, d, version=None):
+    """ Write the header for an array and returns the version used
 
     Parameters
     ----------
@@ -267,6 +276,14 @@ def write_array_header_1_0(fp, d):
     d : dict
         This has the appropriate entries for writing its string representation
         to the header of the file.
+    version: tuple or None
+        None means use oldest that works
+        explicit version will raise a ValueError if the format does not
+        allow saving this data.  Default: None
+    Returns
+    -------
+    version : tuple of int
+        the file version which needs to be used to store the data
     """
     import struct
     header = ["{"]
@@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d):
     current_header_len = MAGIC_LEN + 2 + len(header) + 1  # 1 for the newline
     topad = 16 - (current_header_len % 16)
     header = asbytes(header + ' '*topad + '\n')
-    if len(header) >= (256*256):
-        raise ValueError("header does not fit inside %s bytes" % (256*256))
-    header_len_str = struct.pack('<H', len(header))
+
+    if len(header) >= (256*256) and version == (1, 0):
+        raise ValueError("header does not fit inside %s bytes required by the"
+                         " 1.0 format" % (256*256))
+    if len(header) < (256*256):
+        header_len_str = struct.pack('<H', len(header))
+        version = (1, 0)
+    elif len(header) < (2**32):
+        header_len_str = struct.pack('<I', len(header))
+        version = (2, 0)
+    else:
+        raise ValueError("header does not fit inside 4 GiB required by "
+                         "the 2.0 format")
+
+    fp.write(magic(*version))
     fp.write(header_len_str)
     fp.write(header)
+    return version
+
+def write_array_header_1_0(fp, d):
+    """ Write the header for an array using the 1.0 format.
+
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
+    """
+    _write_array_header(fp, d, (1, 0))
+
+
+def write_array_header_2_0(fp, d):
+    """ Write the header for an array using the 2.0 format.
+        The 2.0 format allows storing very large structured arrays.
+
+    .. versionadded:: 1.9.0
+
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
+    """
+    _write_array_header(fp, d, (2, 0))
 
 def read_array_header_1_0(fp):
     """
@@ -317,12 +375,58 @@ def read_array_header_1_0(fp):
         If the data is invalid.
 
     """
+    _read_array_header(fp, version=(1, 0))
+
+def read_array_header_2_0(fp):
+    """
+    Read an array header from a filelike object using the 2.0 file format
+    version.
+
+    This will leave the file object located just after the header.
+
+    .. versionadded:: 1.9.0
+
+    Parameters
+    ----------
+    fp : filelike object
+        A file object or something with a `.read()` method like a file.
+
+    Returns
+    -------
+    shape : tuple of int
+        The shape of the array.
+    fortran_order : bool
+        The array data will be written out directly if it is either C-contiguous
+        or Fortran-contiguous. Otherwise, it will be made contiguous before
+        writing it out.
+    dtype : dtype
+        The dtype of the file's data.
+
+    Raises
+    ------
+    ValueError
+        If the data is invalid.
+
+    """
+    _read_array_header(fp, version=(2, 0))
+
+def _read_array_header(fp, version):
+    """
+    see read_array_header_1_0
+    """
     # Read an unsigned, little-endian short int which has the length of the
     # header.
     import struct
-    hlength_str = _read_bytes(fp, 2, "array header length")
-    header_length = struct.unpack('<H', hlength_str)[0]
-    header = _read_bytes(fp, header_length, "array header")
+    if version == (1, 0):
+        hlength_str = _read_bytes(fp, 2, "array header length")
+        header_length = struct.unpack('<H', hlength_str)[0]
+        header = _read_bytes(fp, header_length, "array header")
+    elif version == (2, 0):
+        hlength_str = _read_bytes(fp, 4, "array header length")
+        header_length = struct.unpack('<I', hlength_str)[0]
+        header = _read_bytes(fp, header_length, "array header")
+    else:
+        raise ValueError("Invalid version %r" % version)
 
     # The header is a pretty-printed string representation of a literal Python
     # dictionary with trailing newlines padded to a 16-byte boundary. The keys
@@ -359,7 +463,7 @@ def read_array_header_1_0(fp):
 
     return d['shape'], d['fortran_order'], dtype
 
-def write_array(fp, array, version=(1, 0)):
+def write_array(fp, array, version=None):
     """
     Write an array to an NPY file, including a header.
 
@@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)):
         method.
     array : ndarray
         The array to write to disk.
-    version : (int, int), optional
-        The version number of the format.  Default: (1, 0)
+    version : (int, int) or None, optional
+        The version number of the format. None means use the oldest supported
+        version that is able to store the data.  Default: None
 
     Raises
     ------
@@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)):
         are not picklable.
 
     """
-    if version != (1, 0):
-        msg = "we only support format version (1,0), not %s"
-        raise ValueError(msg % (version,))
-    fp.write(magic(*version))
-    write_array_header_1_0(fp, header_data_from_array_1_0(array))
+    _check_version(version)
+    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
+                                   version)
+    # this warning can be removed when 1.9 has aged enough
+    if version != (2, 0) and used_ver == (2, 0):
+        warnings.warn("Stored array in format 2.0. It can only be"
+                      "read by NumPy >= 1.9", UserWarning)
 
     # Set buffer size to 16 MiB to hide the Python loop overhead.
     buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
@@ -407,7 +514,7 @@ def write_array(fp, array, version=(1, 0)):
             for chunk in numpy.nditer(
                     array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                     buffersize=buffersize, order='F'):
-                fp.write(chunk.tostring('C'))
+                fp.write(chunk.tobytes('C'))
     else:
         if isfileobj(fp):
             array.tofile(fp)
@@ -415,7 +522,7 @@ def write_array(fp, array, version=(1, 0)):
             for chunk in numpy.nditer(
                     array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                     buffersize=buffersize, order='C'):
-                fp.write(chunk.tostring('C'))
+                fp.write(chunk.tobytes('C'))
 
 
 def read_array(fp):
@@ -440,10 +547,8 @@ def read_array(fp):
 
     """
     version = read_magic(fp)
-    if version != (1, 0):
-        msg = "only support version (1,0) of file format, not %r"
-        raise ValueError(msg % (version,))
-    shape, fortran_order, dtype = read_array_header_1_0(fp)
+    _check_version(version)
+    shape, fortran_order, dtype = _read_array_header(fp, version)
     if len(shape) == 0:
         count = 1
     else:
@@ -486,7 +591,7 @@ def read_array(fp):
 
 
 def open_memmap(filename, mode='r+', dtype=None, shape=None,
-                fortran_order=False, version=(1, 0)):
+                fortran_order=False, version=None):
     """
     Open a .npy file as a memory-mapped array.
 
@@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         Whether the array should be Fortran-contiguous (True) or
         C-contiguous (False, the default) if we are creating a new file
         in "write" mode.
-    version : tuple of int (major, minor)
+    version : tuple of int (major, minor) or None
         If the mode is a "write" mode, then this is the version of the file
-        format used to create the file.  Default: (1,0)
+        format used to create the file.
+        None means use the oldest supported version that is able to store the
+        data.  Default: None
 
     Returns
     -------
@@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
     if 'w' in mode:
         # We are creating the file, not reading it.
         # Check if we ought to create the file.
-        if version != (1, 0):
-            msg = "only support version (1,0) of file format, not %r"
-            raise ValueError(msg % (version,))
+        _check_version(version)
         # Ensure that the given dtype is an authentic dtype object rather than
         # just something that can be interpreted as a dtype object.
         dtype = numpy.dtype(dtype)
@@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         # If we got here, then it should be safe to create the file.
         fp = open(filename, mode+'b')
         try:
-            fp.write(magic(*version))
-            write_array_header_1_0(fp, d)
+            used_ver = _write_array_header(fp, d, version)
+            # this warning can be removed when 1.9 has aged enough
+            if version != (2, 0) and used_ver == (2, 0):
+                warnings.warn("Stored array in format 2.0. It can only be"
+                              "read by NumPy >= 1.9", UserWarning)
             offset = fp.tell()
         finally:
             fp.close()
@@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         fp = open(filename, 'rb')
         try:
             version = read_magic(fp)
-            if version != (1, 0):
-                msg = "only support version (1,0) of file format, not %r"
-                raise ValueError(msg % (version,))
-            shape, fortran_order, dtype = read_array_header_1_0(fp)
+            _check_version(version)
+
+            shape, fortran_order, dtype = _read_array_header(fp, version)
             if dtype.hasobject:
                 msg = "Array can't be memory-mapped: Python objects in dtype."
                 raise ValueError(msg)