diff options
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 175 |
1 files changed, 141 insertions, 34 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 4cfbbe05d..6083312de 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -139,6 +139,7 @@ from __future__ import division, absolute_import, print_function import numpy import sys import io +import warnings from numpy.lib.utils import safe_eval from numpy.compat import asbytes, isfileobj, long, basestring @@ -151,6 +152,14 @@ MAGIC_PREFIX = asbytes('\x93NUMPY') MAGIC_LEN = len(MAGIC_PREFIX) + 2 BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes +# difference between version 1.0 and 2.0 is a 4 byte (I) header length +# instead of 2 bytes (H) allowing storage of large structured arrays + +def _check_version(version): + if version not in [(1, 0), (2, 0), None]: + msg = "we only support format version (1,0) and (2, 0), not %s" + raise ValueError(msg % (version,)) + def magic(major, minor): """ Return the magic string for the given file format version. @@ -258,8 +267,8 @@ def header_data_from_array_1_0(array): d['descr'] = dtype_to_descr(array.dtype) return d -def write_array_header_1_0(fp, d): - """ Write the header for an array using the 1.0 format. +def _write_array_header(fp, d, version=None): + """ Write the header for an array and returns the version used Parameters ---------- @@ -267,6 +276,14 @@ def write_array_header_1_0(fp, d): d : dict This has the appropriate entries for writing its string representation to the header of the file. + version: tuple or None + None means use oldest that works + explicit version will raise a ValueError if the format does not + allow saving this data. 
Default: None + Returns + ------- + version : tuple of int + the file version which needs to be used to store the data """ import struct header = ["{"] @@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d): current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline topad = 16 - (current_header_len % 16) header = asbytes(header + ' '*topad + '\n') - if len(header) >= (256*256): - raise ValueError("header does not fit inside %s bytes" % (256*256)) - header_len_str = struct.pack('<H', len(header)) + + if len(header) >= (256*256) and version == (1, 0): + raise ValueError("header does not fit inside %s bytes required by the" + " 1.0 format" % (256*256)) + if len(header) < (256*256) and version != (2, 0): + header_len_str = struct.pack('<H', len(header)) + version = (1, 0) + elif len(header) < (2**32): + header_len_str = struct.pack('<I', len(header)) + version = (2, 0) + else: + raise ValueError("header does not fit inside 4 GiB required by " + "the 2.0 format") + + fp.write(magic(*version)) fp.write(header_len_str) fp.write(header) + return version + +def write_array_header_1_0(fp, d): + """ Write the header for an array using the 1.0 format. + + Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ + _write_array_header(fp, d, (1, 0)) + + +def write_array_header_2_0(fp, d): + """ Write the header for an array using the 2.0 format. + The 2.0 format allows storing very large structured arrays. + + .. versionadded:: 1.9.0 + + Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ + _write_array_header(fp, d, (2, 0)) def read_array_header_1_0(fp): """ @@ -317,12 +375,58 @@ def read_array_header_1_0(fp): If the data is invalid.
""" + return _read_array_header(fp, version=(1, 0)) + +def read_array_header_2_0(fp): + """ + Read an array header from a filelike object using the 2.0 file format + version. + + This will leave the file object located just after the header. + + .. versionadded:: 1.9.0 + + Parameters + ---------- + fp : filelike object + A file object or something with a `.read()` method like a file. + + Returns + ------- + shape : tuple of int + The shape of the array. + fortran_order : bool + The array data will be written out directly if it is either C-contiguous + or Fortran-contiguous. Otherwise, it will be made contiguous before + writing it out. + dtype : dtype + The dtype of the file's data. + + Raises + ------ + ValueError + If the data is invalid. + + """ + return _read_array_header(fp, version=(2, 0)) + +def _read_array_header(fp, version): + """ + see read_array_header_1_0 + """ # Read an unsigned, little-endian short int which has the length of the # header. import struct - hlength_str = _read_bytes(fp, 2, "array header length") - header_length = struct.unpack('<H', hlength_str)[0] - header = _read_bytes(fp, header_length, "array header") + if version == (1, 0): + hlength_str = _read_bytes(fp, 2, "array header length") + header_length = struct.unpack('<H', hlength_str)[0] + header = _read_bytes(fp, header_length, "array header") + elif version == (2, 0): + hlength_str = _read_bytes(fp, 4, "array header length") + header_length = struct.unpack('<I', hlength_str)[0] + header = _read_bytes(fp, header_length, "array header") + else: + raise ValueError("Invalid version %r" % (version,)) # The header is a pretty-printed string representation of a literal Python # dictionary with trailing newlines padded to a 16-byte boundary. The keys @@ -359,7 +463,7 @@ def read_array_header_1_0(fp): return d['shape'], d['fortran_order'], dtype -def write_array(fp, array, version=(1, 0)): +def write_array(fp, array, version=None): """ Write an array to an NPY file, including a header.
@@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)): method. array : ndarray The array to write to disk. - version : (int, int), optional - The version number of the format. Default: (1, 0) + version : (int, int) or None, optional + The version number of the format. None means use the oldest supported + version that is able to store the data. Default: None Raises ------ @@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)): are not picklable. """ - if version != (1, 0): - msg = "we only support format version (1,0), not %s" - raise ValueError(msg % (version,)) - fp.write(magic(*version)) - write_array_header_1_0(fp, header_data_from_array_1_0(array)) + _check_version(version) + used_ver = _write_array_header(fp, header_data_from_array_1_0(array), + version) + # this warning can be removed when 1.9 has aged enough + if version != (2, 0) and used_ver == (2, 0): + warnings.warn("Stored array in format 2.0. It can only be" + " read by NumPy >= 1.9", UserWarning) # Set buffer size to 16 MiB to hide the Python loop overhead.
buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) @@ -407,7 +514,7 @@ def write_array(fp, array, version=(1, 0)): for chunk in numpy.nditer( array, flags=['external_loop', 'buffered', 'zerosize_ok'], buffersize=buffersize, order='F'): - fp.write(chunk.tostring('C')) + fp.write(chunk.tobytes('C')) else: if isfileobj(fp): array.tofile(fp) @@ -415,7 +522,7 @@ def write_array(fp, array, version=(1, 0)): for chunk in numpy.nditer( array, flags=['external_loop', 'buffered', 'zerosize_ok'], buffersize=buffersize, order='C'): - fp.write(chunk.tostring('C')) + fp.write(chunk.tobytes('C')) def read_array(fp): @@ -440,10 +547,8 @@ def read_array(fp): """ version = read_magic(fp) - if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) - shape, fortran_order, dtype = read_array_header_1_0(fp) + _check_version(version) + shape, fortran_order, dtype = _read_array_header(fp, version) if len(shape) == 0: count = 1 else: @@ -486,7 +591,7 @@ def read_array(fp): def open_memmap(filename, mode='r+', dtype=None, shape=None, - fortran_order=False, version=(1, 0)): + fortran_order=False, version=None): """ Open a .npy file as a memory-mapped array. @@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, Whether the array should be Fortran-contiguous (True) or C-contiguous (False, the default) if we are creating a new file in "write" mode. - version : tuple of int (major, minor) + version : tuple of int (major, minor) or None If the mode is a "write" mode, then this is the version of the file - format used to create the file. Default: (1,0) + format used to create the file. + None means use the oldest supported version that is able to store the + data. Default: None Returns ------- @@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, if 'w' in mode: # We are creating the file, not reading it. # Check if we ought to create the file. 
- if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) + _check_version(version) # Ensure that the given dtype is an authentic dtype object rather than # just something that can be interpreted as a dtype object. dtype = numpy.dtype(dtype) @@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, # If we got here, then it should be safe to create the file. fp = open(filename, mode+'b') try: - fp.write(magic(*version)) - write_array_header_1_0(fp, d) + used_ver = _write_array_header(fp, d, version) + # this warning can be removed when 1.9 has aged enough + if version != (2, 0) and used_ver == (2, 0): + warnings.warn("Stored array in format 2.0. It can only be" + " read by NumPy >= 1.9", UserWarning) offset = fp.tell() finally: fp.close() @@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, fp = open(filename, 'rb') try: version = read_magic(fp) - if version != (1, 0): - msg = "only support version (1,0) of file format, not %r" - raise ValueError(msg % (version,)) - shape, fortran_order, dtype = read_array_header_1_0(fp) + _check_version(version) + + shape, fortran_order, dtype = _read_array_header(fp, version) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) |