summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Charles Harris <charlesr.harris@gmail.com> 2014-06-02 14:07:29 -0600
committer Charles Harris <charlesr.harris@gmail.com> 2014-06-02 14:07:29 -0600
commit e6f43660b156438b0ad4f10b4c8503ba478c0cdd (patch)
tree 17924dd5cf88a9d79ce244bc5bacebe8c9d5248b
parent 5e7e24e3959e5b44218751f016b5b912e2b9c7fa (diff)
parent fc50d27b4e125052d44b0643bc344e4c5062316a (diff)
download numpy-e6f43660b156438b0ad4f10b4c8503ba478c0cdd.tar.gz
Merge pull request #4765 from juliantaylor/npyformat-2.0
ENH: add storage format 2.0 with 4 byte header size
-rw-r--r-- doc/release/1.9.0-notes.rst 8
-rw-r--r-- numpy/lib/format.py 171
-rw-r--r-- numpy/lib/tests/test_format.py 57
3 files changed, 202 insertions, 34 deletions
diff --git a/doc/release/1.9.0-notes.rst b/doc/release/1.9.0-notes.rst
index 88bff1212..29b2703b0 100644
--- a/doc/release/1.9.0-notes.rst
+++ b/doc/release/1.9.0-notes.rst
@@ -196,6 +196,14 @@ comparison when the numpy version goes to 1.10.devel. For example::
>>> if NumpyVersion(np.__version__) < '1.10.0':
... print('Wow, that is an old NumPy version!')
+Allow saving arrays with a large number of named columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The numpy storage format 1.0 only allowed the array header to have a total size
+of 65535 bytes. This can be exceeded by structured arrays with a large number
+of columns. A new format 2.0 has been added which extends the header size to 4
+GiB. `np.save` will automatically save in 2.0 format if the data requires it;
+otherwise it will use the more compatible 1.0 format.
+
Improvements
============
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 631e92959..6083312de 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -139,6 +139,7 @@ from __future__ import division, absolute_import, print_function
import numpy
import sys
import io
+import warnings
from numpy.lib.utils import safe_eval
from numpy.compat import asbytes, isfileobj, long, basestring
@@ -151,6 +152,14 @@ MAGIC_PREFIX = asbytes('\x93NUMPY')
MAGIC_LEN = len(MAGIC_PREFIX) + 2
BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes
+# difference between version 1.0 and 2.0 is a 4 byte (I) header length
+# instead of 2 bytes (H) allowing storage of large structured arrays
+
+def _check_version(version):
+ if version not in [(1, 0), (2, 0), None]:
+ msg = "we only support format version (1,0) and (2, 0), not %s"
+ raise ValueError(msg % (version,))
+
def magic(major, minor):
""" Return the magic string for the given file format version.
@@ -258,8 +267,8 @@ def header_data_from_array_1_0(array):
d['descr'] = dtype_to_descr(array.dtype)
return d
-def write_array_header_1_0(fp, d):
- """ Write the header for an array using the 1.0 format.
+def _write_array_header(fp, d, version=None):
+ """ Write the header for an array and returns the version used
Parameters
----------
@@ -267,6 +276,14 @@ def write_array_header_1_0(fp, d):
d : dict
This has the appropriate entries for writing its string representation
to the header of the file.
+ version: tuple or None
+ None means use oldest that works
+ explicit version will raise a ValueError if the format does not
+ allow saving this data. Default: None
+ Returns
+ -------
+ version : tuple of int
+ the file version which needs to be used to store the data
"""
import struct
header = ["{"]
@@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d):
current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline
topad = 16 - (current_header_len % 16)
header = asbytes(header + ' '*topad + '\n')
- if len(header) >= (256*256):
- raise ValueError("header does not fit inside %s bytes" % (256*256))
- header_len_str = struct.pack('<H', len(header))
+
+ # NOTE(review): the padding computed above assumes the 2 byte length field
+ # of format 1.0; with the 4 byte field of 2.0 the header ends 2 bytes past
+ # the 16 byte boundary. Files stay readable, only the alignment is lost.
+ if len(header) >= (256*256) and version == (1, 0):
+ raise ValueError("header does not fit inside %s bytes required by the"
+ " 1.0 format" % (256*256))
+ # Honor an explicitly requested version; the size-based choice of the
+ # oldest usable format only applies when the caller passed version=None.
+ if len(header) < (256*256) and version in (None, (1, 0)):
+ header_len_str = struct.pack('<H', len(header))
+ version = (1, 0)
+ elif len(header) < (2**32) and version in (None, (2, 0)):
+ header_len_str = struct.pack('<I', len(header))
+ version = (2, 0)
+ else:
+ raise ValueError("header does not fit inside 4 GiB required by "
+ "the 2.0 format")
+
+ fp.write(magic(*version))
fp.write(header_len_str)
fp.write(header)
+ return version
+
+def write_array_header_1_0(fp, d):
+ """ Write the header for an array using the 1.0 format.
+
+ Parameters
+ ----------
+ fp : filelike object
+ d : dict
+ This has the appropriate entries for writing its string representation
+ to the header of the file.
+ """
+ _write_array_header(fp, d, (1, 0))
+
+
+def write_array_header_2_0(fp, d):
+ """ Write the header for an array using the 2.0 format.
+ The 2.0 format allows storing very large structured arrays.
+
+ .. versionadded:: 1.9.0
+
+ Parameters
+ ----------
+ fp : filelike object
+ d : dict
+ This has the appropriate entries for writing its string representation
+ to the header of the file.
+ """
+ _write_array_header(fp, d, (2, 0))
def read_array_header_1_0(fp):
"""
@@ -317,12 +375,58 @@ def read_array_header_1_0(fp):
If the data is invalid.
"""
+ # the wrapper must hand back the (shape, fortran_order, dtype) tuple
+ return _read_array_header(fp, version=(1, 0))
+
+def read_array_header_2_0(fp):
+ """
+ Read an array header from a filelike object using the 2.0 file format
+ version.
+
+ This will leave the file object located just after the header.
+
+ .. versionadded:: 1.9.0
+
+ Parameters
+ ----------
+ fp : filelike object
+ A file object or something with a `.read()` method like a file.
+
+ Returns
+ -------
+ shape : tuple of int
+ The shape of the array.
+ fortran_order : bool
+ The array data will be written out directly if it is either C-contiguous
+ or Fortran-contiguous. Otherwise, it will be made contiguous before
+ writing it out.
+ dtype : dtype
+ The dtype of the file's data.
+
+ Raises
+ ------
+ ValueError
+ If the data is invalid.
+
+ """
+ # the wrapper must hand back the (shape, fortran_order, dtype) tuple
+ return _read_array_header(fp, version=(2, 0))
+
+def _read_array_header(fp, version):
+ """
+ see read_array_header_1_0
+ """
# Read an unsigned, little-endian short int which has the length of the
# header.
import struct
- hlength_str = _read_bytes(fp, 2, "array header length")
- header_length = struct.unpack('<H', hlength_str)[0]
- header = _read_bytes(fp, header_length, "array header")
+ if version == (1, 0):
+ hlength_str = _read_bytes(fp, 2, "array header length")
+ header_length = struct.unpack('<H', hlength_str)[0]
+ header = _read_bytes(fp, header_length, "array header")
+ elif version == (2, 0):
+ hlength_str = _read_bytes(fp, 4, "array header length")
+ header_length = struct.unpack('<I', hlength_str)[0]
+ header = _read_bytes(fp, header_length, "array header")
+ else:
+ # wrap in a 1-tuple: a bare tuple would be unpacked as format arguments
+ raise ValueError("Invalid version %r" % (version,))
# The header is a pretty-printed string representation of a literal Python
# dictionary with trailing newlines padded to a 16-byte boundary. The keys
@@ -359,7 +463,7 @@ def read_array_header_1_0(fp):
return d['shape'], d['fortran_order'], dtype
-def write_array(fp, array, version=(1, 0)):
+def write_array(fp, array, version=None):
"""
Write an array to an NPY file, including a header.
@@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)):
method.
array : ndarray
The array to write to disk.
- version : (int, int), optional
- The version number of the format. Default: (1, 0)
+ version : (int, int) or None, optional
+ The version number of the format. None means use the oldest supported
+ version that is able to store the data. Default: None
Raises
------
@@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)):
are not picklable.
"""
- if version != (1, 0):
- msg = "we only support format version (1,0), not %s"
- raise ValueError(msg % (version,))
- fp.write(magic(*version))
- write_array_header_1_0(fp, header_data_from_array_1_0(array))
+ _check_version(version)
+ used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
+ version)
+ # this warning can be removed when 1.9 has aged enough
+ if version != (2, 0) and used_ver == (2, 0):
+ warnings.warn("Stored array in format 2.0. It can only be "
+ "read by NumPy >= 1.9", UserWarning)
# Set buffer size to 16 MiB to hide the Python loop overhead.
buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
@@ -440,10 +547,8 @@ def read_array(fp):
"""
version = read_magic(fp)
- if version != (1, 0):
- msg = "only support version (1,0) of file format, not %r"
- raise ValueError(msg % (version,))
- shape, fortran_order, dtype = read_array_header_1_0(fp)
+ _check_version(version)
+ shape, fortran_order, dtype = _read_array_header(fp, version)
if len(shape) == 0:
count = 1
else:
@@ -486,7 +591,7 @@ def read_array(fp):
def open_memmap(filename, mode='r+', dtype=None, shape=None,
- fortran_order=False, version=(1, 0)):
+ fortran_order=False, version=None):
"""
Open a .npy file as a memory-mapped array.
@@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
Whether the array should be Fortran-contiguous (True) or
C-contiguous (False, the default) if we are creating a new file
in "write" mode.
- version : tuple of int (major, minor)
+ version : tuple of int (major, minor) or None
If the mode is a "write" mode, then this is the version of the file
- format used to create the file. Default: (1,0)
+ format used to create the file.
+ None means use the oldest supported version that is able to store the
+ data. Default: None
Returns
-------
@@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
if 'w' in mode:
# We are creating the file, not reading it.
# Check if we ought to create the file.
- if version != (1, 0):
- msg = "only support version (1,0) of file format, not %r"
- raise ValueError(msg % (version,))
+ _check_version(version)
# Ensure that the given dtype is an authentic dtype object rather than
# just something that can be interpreted as a dtype object.
dtype = numpy.dtype(dtype)
@@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
# If we got here, then it should be safe to create the file.
fp = open(filename, mode+'b')
try:
- fp.write(magic(*version))
- write_array_header_1_0(fp, d)
+ used_ver = _write_array_header(fp, d, version)
+ # this warning can be removed when 1.9 has aged enough
+ if version != (2, 0) and used_ver == (2, 0):
+ warnings.warn("Stored array in format 2.0. It can only be "
+ "read by NumPy >= 1.9", UserWarning)
offset = fp.tell()
finally:
fp.close()
@@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
fp = open(filename, 'rb')
try:
version = read_magic(fp)
- if version != (1, 0):
- msg = "only support version (1,0) of file format, not %r"
- raise ValueError(msg % (version,))
- shape, fortran_order, dtype = read_array_header_1_0(fp)
+ _check_version(version)
+
+ shape, fortran_order, dtype = _read_array_header(fp, version)
if dtype.hasobject:
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index c294637ad..1034b5125 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -280,6 +280,7 @@ import sys
import os
import shutil
import tempfile
+import warnings
from io import BytesIO
import numpy as np
@@ -521,19 +522,71 @@ def test_compressed_roundtrip():
assert_array_equal(arr, arr1)
-def test_write_version_1_0():
+def test_version_2_0():
+ f = BytesIO()
+ # requires more than 2 byte for header
+ dt = [(("%d" % i) * 100, float) for i in range(500)]
+ d = np.ones(1000, dtype=dt)
+
+ format.write_array(f, d, version=(2, 0))
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', UserWarning)
+ format.write_array(f, d)
+ assert_(w[0].category is UserWarning)
+
+ f.seek(0)
+ n = format.read_array(f)
+ assert_array_equal(d, n)
+
+ # 1.0 requested but data cannot be saved this way
+ assert_raises(ValueError, format.write_array, f, d, (1, 0))
+
+
+def test_version_2_0_memmap():
+ # requires more than 2 byte for header
+ dt = [(("%d" % i) * 100, float) for i in range(500)]
+ d = np.ones(1000, dtype=dt)
+ tf = tempfile.mktemp('', 'mmap', dir=tempdir)
+
+ # 1.0 requested but data cannot be saved this way
+ assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=(1, 0))
+
+ ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=(2, 0))
+ ma[...] = d
+ del ma
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', UserWarning)
+ ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ shape=d.shape, version=None)
+ assert_(w[0].category is UserWarning)
+ ma[...] = d
+ del ma
+
+ ma = format.open_memmap(tf, mode='r')
+ assert_array_equal(ma, d)
+
+
+def test_write_version():
f = BytesIO()
arr = np.arange(1)
# These should pass.
format.write_array(f, arr, version=(1, 0))
format.write_array(f, arr)
+ format.write_array(f, arr, version=None)
+ format.write_array(f, arr)
+
+ format.write_array(f, arr, version=(2, 0))
+ format.write_array(f, arr)
+
# These should all fail.
bad_versions = [
(1, 1),
(0, 0),
(0, 1),
- (2, 0),
(2, 2),
(255, 255),
]