diff options
author | Robert Kern <robert.kern@gmail.com> | 2007-12-20 07:38:28 +0000 |
---|---|---|
committer | Robert Kern <robert.kern@gmail.com> | 2007-12-20 07:38:28 +0000 |
commit | 0c728126f30c79512fdf0dff7c928c387291ce5a (patch) | |
tree | 059e87cfff37b0556a066ca14b553d1e26ab74e3 | |
parent | 7ce53484bbe4c54f3fb3f24687846e6c54ad88d5 (diff) | |
download | numpy-0c728126f30c79512fdf0dff7c928c387291ce5a.tar.gz |
* Warn potential users about format instability.
* Implement mmap capability.
-rw-r--r-- | format.py | 118 |
-rw-r--r-- | tests/test_format.py | 56 |
2 files changed, 167 insertions, 7 deletions
@@ -1,6 +1,13 @@ """ Define a simple format for saving numpy arrays to disk with the full information about them. +WARNING: THE FORMAT IS CURRENTLY UNSTABLE. DO NOT STORE CRITICAL DATA WITH IT. + While this code is in an SVN branch, the format may change without + notice, without backwards compatibility, and without changing the + format's version number. When the code moves into the trunk the format + will be stabilized, the version number will increment as changes occur, + and backwards compatibility with older versions will be maintained. + Format Version 1.0 ------------------ @@ -13,8 +20,8 @@ The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. \\x00. Note: the version of the file format is not tied to the version of the numpy package. -The next 2 bytes form an unsigned short int: the length of the header data -HEADER_LEN. +The next 2 bytes form a little-endian unsigned short int: the length of the +header data HEADER_LEN. The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. @@ -116,13 +123,18 @@ def dtype_to_descr(dtype): else: return dtype.str -def write_array_header_1_0(fp, array): - """ Write the header for an array using the 1.0 format. +def header_data_from_array_1_0(array): + """ Get the dictionary of header metadata from a numpy.ndarray. Parameters ---------- - fp : filelike object array : numpy.ndarray + + Returns + ------- + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. """ d = {} d['shape'] = array.shape @@ -137,7 +149,18 @@ def write_array_header_1_0(fp, array): d['fortran_order'] = False d['descr'] = dtype_to_descr(array.dtype) + return d + +def write_array_header_1_0(fp, d): + """ Write the header for an array using the 1.0 format. 
+ Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ header = pprint.pformat(d) # Pad the header with spaces and a final newline such that the magic string, # the header-length short and the header are aligned on a 16-byte boundary. @@ -239,7 +262,7 @@ def write_array(fp, array, version=(1,0)): if version != (1, 0): raise ValueError("we only support format version (1,0), not %s" % (version,)) fp.write(magic(*version)) - write_array_header_1_0(fp, array) + write_array_header_1_0(fp, header_data_from_array_1_0(array)) if array.dtype.hasobject: # We contain Python objects so we cannot write out the data directly. # Instead, we will pickle it out with version 2 of the pickle protocol. @@ -304,3 +327,86 @@ def read_array(fp): return array + +def open_memmap(filename, mode='r+', dtype=None, shape=None, + fortran_order=False, version=(1,0)): + """ Open a .npy file as a memory-mapped array. + + Parameters + ---------- + filename : str + mode : str, optional + The mode to open the file with. In addition to the standard file modes, + 'c' is also accepted to mean "copy on write". + dtype : dtype, optional + shape : tuple of int, optional + fortran_order : bool, optional + If the mode is a "write" mode, then the file will be created using this + dtype, shape, and contiguity. + version : tuple of int (major, minor) + If the mode is a "write" mode, then this is the version of the file + format used to create the file. + + Returns + ------- + marray : numpy.memmap + + Raises + ------ + ValueError if the data or the mode is invalid. + IOError if the file is not found or cannot be opened correctly. + """ + if 'w' in mode: + # We are creating the file, not reading it. + # Check if we ought to create the file. 
+ if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + # Ensure that the given dtype is an authentic dtype object rather than + # just something that can be interpreted as a dtype object. + dtype = numpy.dtype(dtype) + if dtype.hasobject: + raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped") + d = dict( + descr=dtype_to_descr(dtype), + fortran_order=fortran_order, + shape=shape, + ) + # If we got here, then it should be safe to create the file. + fp = open(filename, mode+'b') + try: + fp.write(magic(*version)) + write_array_header_1_0(fp, d) + offset = fp.tell() + finally: + fp.close() + else: + # Read the header of the file first. + fp = open(filename, 'rb') + try: + version = read_magic(fp) + if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + shape, fortran_order, dtype = read_array_header_1_0(fp) + if dtype.hasobject: + raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped") + offset = fp.tell() + finally: + fp.close() + + if fortran_order: + order = 'F' + else: + order = 'C' + + # We need to change a write-only mode to a read-write mode since we've + # already written data to the file. + if mode == 'w+': + mode = 'r+' + + marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order, + mode=mode, offset=offset) + + return marray + + + diff --git a/tests/test_format.py b/tests/test_format.py index 064754977..b238950a3 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -276,6 +276,9 @@ Test the header writing. from cStringIO import StringIO +import os +import shutil +import tempfile from nose.tools import raises @@ -285,6 +288,20 @@ from numpy.testing import assert_array_equal from numpy.lib import format +tempdir = None + +# Module-level setup. 
+def setup_module(): + global tempdir + tempdir = tempfile.mkdtemp() + +def teardown_module(): + global tempdir + if tempdir is not None and os.path.isdir(tempdir): + shutil.rmtree(tempdir) + tempdir = None + + # Generate some basic arrays to test with. scalars = [ np.uint8, @@ -395,13 +412,50 @@ def roundtrip(arr): arr2 = format.read_array(f2) return arr2 +def assert_equal(o1, o2): + assert o1 == o2 + def test_roundtrip(): for arr in basic_arrays + record_arrays: - print repr(arr) arr2 = roundtrip(arr) yield assert_array_equal, arr, arr2 +def test_memmap_roundtrip(): + for arr in basic_arrays + record_arrays: + if arr.dtype.hasobject: + # Skip these since they can't be mmap'ed. + continue + # Write it out normally and through mmap. + nfn = os.path.join(tempdir, 'normal.npy') + mfn = os.path.join(tempdir, 'memmap.npy') + fp = open(nfn, 'wb') + try: + format.write_array(fp, arr) + finally: + fp.close() + + fortran_order = (arr.flags.f_contiguous and not arr.flags.c_contiguous) + ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype, + shape=arr.shape, fortran_order=fortran_order) + ma[...] = arr + del ma + + # Check that both of these files' contents are the same. + fp = open(nfn, 'rb') + normal_bytes = fp.read() + fp.close() + fp = open(mfn, 'rb') + memmap_bytes = fp.read() + fp.close() + yield assert_equal, normal_bytes, memmap_bytes + + # Check that reading the file using memmap works. + ma = format.open_memmap(nfn, mode='r') + yield assert_array_equal, ma, arr + del ma + + def test_write_version_1_0(): f = StringIO() arr = np.arange(1) |