diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2014-06-02 14:07:29 -0600 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2014-06-02 14:07:29 -0600 |
commit | e6f43660b156438b0ad4f10b4c8503ba478c0cdd (patch) | |
tree | 17924dd5cf88a9d79ce244bc5bacebe8c9d5248b | |
parent | 5e7e24e3959e5b44218751f016b5b912e2b9c7fa (diff) | |
parent | fc50d27b4e125052d44b0643bc344e4c5062316a (diff) | |
download | numpy-e6f43660b156438b0ad4f10b4c8503ba478c0cdd.tar.gz |
Merge pull request #4765 from juliantaylor/npyformat-2.0
ENH: add storage format 2.0 with 4 byte header size
-rw-r--r-- | doc/release/1.9.0-notes.rst | 8 |
-rw-r--r-- | numpy/lib/format.py | 171 |
-rw-r--r-- | numpy/lib/tests/test_format.py | 57 |
3 files changed, 202 insertions, 34 deletions
diff --git a/doc/release/1.9.0-notes.rst b/doc/release/1.9.0-notes.rst index 88bff1212..29b2703b0 100644 --- a/doc/release/1.9.0-notes.rst +++ b/doc/release/1.9.0-notes.rst @@ -196,6 +196,14 @@ comparison when the numpy version goes to 1.10.devel. For example:: >>> if NumpyVersion(np.__version__) < '1.10.0'): ... print('Wow, that is an old NumPy version!') +Allow saving arrays with large number of named columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The numpy storage format 1.0 only allowed the array header to have a total size +of 65535 bytes. This can be exceeded by structured arrays with a large number +of columns. A new format 2.0 has been added which extends the header size to 4 +GiB. `np.save` will automatically save in 2.0 format if the data requires it, +else it will always use the more compatible 1.0 format. + Improvements ============ diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 631e92959..6083312de 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -139,6 +139,7 @@ from __future__ import division, absolute_import, print_function import numpy import sys import io +import warnings from numpy.lib.utils import safe_eval from numpy.compat import asbytes, isfileobj, long, basestring @@ -151,6 +152,14 @@ MAGIC_PREFIX = asbytes('\x93NUMPY') MAGIC_LEN = len(MAGIC_PREFIX) + 2 BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes +# difference between version 1.0 and 2.0 is a 4 byte (I) header length +# instead of 2 bytes (H) allowing storage of large structured arrays + +def _check_version(version): + if version not in [(1, 0), (2, 0), None]: + msg = "we only support format version (1,0) and (2, 0), not %s" + raise ValueError(msg % (version,)) + def magic(major, minor): """ Return the magic string for the given file format version. 
@@ -258,8 +267,8 @@ def header_data_from_array_1_0(array): d['descr'] = dtype_to_descr(array.dtype) return d -def write_array_header_1_0(fp, d): - """ Write the header for an array using the 1.0 format. +def _write_array_header(fp, d, version=None): + """ Write the header for an array and returns the version used Parameters ---------- @@ -267,6 +276,14 @@ def write_array_header_1_0(fp, d): d : dict This has the appropriate entries for writing its string representation to the header of the file. + version: tuple or None + None means use oldest that works + explicit version will raise a ValueError if the format does not + allow saving this data. Default: None + Returns + ------- + version : tuple of int + the file version which needs to be used to store the data """ import struct header = ["{"] @@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d): current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline topad = 16 - (current_header_len % 16) header = asbytes(header + ' '*topad + '\n') - if len(header) >= (256*256): - raise ValueError("header does not fit inside %s bytes" % (256*256)) - header_len_str = struct.pack('<H', len(header)) + + if len(header) >= (256*256) and version == (1, 0): + raise ValueError("header does not fit inside %s bytes required by the" + " 1.0 format" % (256*256)) + if len(header) < (256*256): + header_len_str = struct.pack('<H', len(header)) + version = (1, 0) + elif len(header) < (2**32): + header_len_str = struct.pack('<I', len(header)) + version = (2, 0) + else: + raise ValueError("header does not fit inside 4 GiB required by " + "the 2.0 format") + + fp.write(magic(*version)) fp.write(header_len_str) fp.write(header) + return version + +def write_array_header_1_0(fp, d): + """ Write the header for an array using the 1.0 format. + + Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. 
+ """ + _write_array_header(fp, d, (1, 0)) + + +def write_array_header_2_0(fp, d): + """ Write the header for an array using the 2.0 format. + The 2.0 format allows storing very large structured arrays. + + .. versionadded:: 1.9.0 + + Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ + _write_array_header(fp, d, (2, 0)) def read_array_header_1_0(fp): """ @@ -317,12 +375,58 @@ def read_array_header_1_0(fp): If the data is invalid. """ + _read_array_header(fp, version=(1, 0)) + +def read_array_header_2_0(fp): + """ + Read an array header from a filelike object using the 2.0 file format + version. + + This will leave the file object located just after the header. + + .. versionadded:: 1.9.0 + + Parameters + ---------- + fp : filelike object + A file object or something with a `.read()` method like a file. + + Returns + ------- + shape : tuple of int + The shape of the array. + fortran_order : bool + The array data will be written out directly if it is either C-contiguous + or Fortran-contiguous. Otherwise, it will be made contiguous before + writing it out. + dtype : dtype + The dtype of the file's data. + + Raises + ------ + ValueError + If the data is invalid. + + """ + _read_array_header(fp, version=(2, 0)) + +def _read_array_header(fp, version): + """ + see read_array_header_1_0 + """ # Read an unsigned, little-endian short int which has the length of the # header. 
import struct - hlength_str = _read_bytes(fp, 2, "array header length") - header_length = struct.unpack('<H', hlength_str)[0] - header = _read_bytes(fp, header_length, "array header") + if version == (1, 0): + hlength_str = _read_bytes(fp, 2, "array header length") + header_length = struct.unpack('<H', hlength_str)[0] + header = _read_bytes(fp, header_length, "array header") + elif version == (2, 0): + hlength_str = _read_bytes(fp, 4, "array header length") + header_length = struct.unpack('<I', hlength_str)[0] + header = _read_bytes(fp, header_length, "array header") + else: + raise ValueError("Invalid version %r" % version) # The header is a pretty-printed string representation of a literal Python # dictionary with trailing newlines padded to a 16-byte boundary. The keys @@ -359,7 +463,7 @@ def read_array_header_1_0(fp): return d['shape'], d['fortran_order'], dtype -def write_array(fp, array, version=(1, 0)): +def write_array(fp, array, version=None): """ Write an array to an NPY file, including a header. @@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)): method. array : ndarray The array to write to disk. - version : (int, int), optional - The version number of the format. Default: (1, 0) + version : (int, int) or None, optional + The version number of the format. None means use the oldest supported + version that is able to store the data. Default: None Raises ------ @@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)): are not picklable. """ - if version != (1, 0): - msg = "we only support format version (1,0), not %s" - raise ValueError(msg % (version,)) - fp.write(magic(*version)) - write_array_header_1_0(fp, header_data_from_array_1_0(array)) + _check_version(version) + used_ver = _write_array_header(fp, header_data_from_array_1_0(array), + version) + # this warning can be removed when 1.9 has aged enough + if version != (2, 0) and used_ver == (2, 0): + warnings.warn("Stored array in format 2.0. 
It can only be" + "read by NumPy >= 1.9", UserWarning) # Set buffer size to 16 MiB to hide the Python loop overhead. buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) @@ -440,10 +547,8 @@ def read_array(fp): """ version = read_magic(fp) - if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) - shape, fortran_order, dtype = read_array_header_1_0(fp) + _check_version(version) + shape, fortran_order, dtype = _read_array_header(fp, version) if len(shape) == 0: count = 1 else: @@ -486,7 +591,7 @@ def read_array(fp): def open_memmap(filename, mode='r+', dtype=None, shape=None, - fortran_order=False, version=(1, 0)): + fortran_order=False, version=None): """ Open a .npy file as a memory-mapped array. @@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, Whether the array should be Fortran-contiguous (True) or C-contiguous (False, the default) if we are creating a new file in "write" mode. - version : tuple of int (major, minor) + version : tuple of int (major, minor) or None If the mode is a "write" mode, then this is the version of the file - format used to create the file. Default: (1,0) + format used to create the file. + None means use the oldest supported version that is able to store the + data. Default: None Returns ------- @@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, if 'w' in mode: # We are creating the file, not reading it. # Check if we ought to create the file. - if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) + _check_version(version) # Ensure that the given dtype is an authentic dtype object rather than # just something that can be interpreted as a dtype object. dtype = numpy.dtype(dtype) @@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, # If we got here, then it should be safe to create the file. 
fp = open(filename, mode+'b') try: - fp.write(magic(*version)) - write_array_header_1_0(fp, d) + used_ver = _write_array_header(fp, d, version) + # this warning can be removed when 1.9 has aged enough + if version != (2, 0) and used_ver == (2, 0): + warnings.warn("Stored array in format 2.0. It can only be" + "read by NumPy >= 1.9", UserWarning) offset = fp.tell() finally: fp.close() @@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, fp = open(filename, 'rb') try: version = read_magic(fp) - if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) - shape, fortran_order, dtype = read_array_header_1_0(fp) + _check_version(version) + + shape, fortran_order, dtype = _read_array_header(fp, version) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py index c294637ad..1034b5125 100644 --- a/numpy/lib/tests/test_format.py +++ b/numpy/lib/tests/test_format.py @@ -280,6 +280,7 @@ import sys import os import shutil import tempfile +import warnings from io import BytesIO import numpy as np @@ -521,19 +522,71 @@ def test_compressed_roundtrip(): assert_array_equal(arr, arr1) -def test_write_version_1_0(): +def test_version_2_0(): + f = BytesIO() + # requires more than 2 byte for header + dt = [(("%d" % i) * 100, float) for i in range(500)] + d = np.ones(1000, dtype=dt) + + format.write_array(f, d, version=(2, 0)) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', UserWarning) + format.write_array(f, d) + assert_(w[0].category is UserWarning) + + f.seek(0) + n = format.read_array(f) + assert_array_equal(d, n) + + # 1.0 requested but data cannot be saved this way + assert_raises(ValueError, format.write_array, f, d, (1, 0)) + + +def test_version_2_0_memmap(): + # requires more than 2 byte for header + dt = [(("%d" % i) * 
100, float) for i in range(500)] + d = np.ones(1000, dtype=dt) + tf = tempfile.mktemp('', 'mmap', dir=tempdir) + + # 1.0 requested but data cannot be saved this way + assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype, + shape=d.shape, version=(1, 0)) + + ma = format.open_memmap(tf, mode='w+', dtype=d.dtype, + shape=d.shape, version=(2, 0)) + ma[...] = d + del ma + + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', UserWarning) + ma = format.open_memmap(tf, mode='w+', dtype=d.dtype, + shape=d.shape, version=None) + assert_(w[0].category is UserWarning) + ma[...] = d + del ma + + ma = format.open_memmap(tf, mode='r') + assert_array_equal(ma, d) + + +def test_write_version(): f = BytesIO() arr = np.arange(1) # These should pass. format.write_array(f, arr, version=(1, 0)) format.write_array(f, arr) + format.write_array(f, arr, version=None) + format.write_array(f, arr) + + format.write_array(f, arr, version=(2, 0)) + format.write_array(f, arr) + # These should all fail. bad_versions = [ (1, 1), (0, 0), (0, 1), - (2, 0), (2, 2), (255, 255), ] |