* Warn potential users about format instability.

* Implement mmap capability.
author: Robert Kern <robert.kern@gmail.com> 2007-12-20 07:38:28 +0000
committer: Robert Kern <robert.kern@gmail.com> 2007-12-20 07:38:28 +0000
commit: 0c728126f30c79512fdf0dff7c928c387291ce5a (patch)
tree: 059e87cfff37b0556a066ca14b553d1e26ab74e3
parent: 7ce53484bbe4c54f3fb3f24687846e6c54ad88d5 (diff)
download: numpy-0c728126f30c79512fdf0dff7c928c387291ce5a.tar.gz
2 files changed, 167 insertions, 7 deletions
diff --git a/format.py b/format.py
index a2fa5bb69..bb58c5c61 100644
--- a/format.py
+++ b/format.py
@@ -1,6 +1,13 @@
 """ Define a simple format for saving numpy arrays to disk with the full
 information about them.
 
+WARNING: THE FORMAT IS CURRENTLY UNSTABLE. DO NOT STORE CRITICAL DATA WITH IT.
+         While this code is in an SVN branch, the format may change without
+         notice, without backwards compatibility, and without changing the
+         format's version number. When the code moves into the trunk the format
+         will be stabilized, the version number will increment as changes occur,
+         and backwards compatibility with older versions will be maintained.
+
 Format Version 1.0
 ------------------
 
@@ -13,8 +20,8 @@ The next 1 byte is an unsigned byte: the minor version number of the file
 format, e.g. \\x00. Note: the version of the file format is not tied to the
 version of the numpy package.
 
-The next 2 bytes form an unsigned short int: the length of the header data
-HEADER_LEN.
+The next 2 bytes form a little-endian unsigned short int: the length of the
+header data HEADER_LEN.
 
 The next HEADER_LEN bytes form the header data describing the array's format. It
 is an ASCII string which contains a Python literal expression of a dictionary.
@@ -116,13 +123,18 @@ def dtype_to_descr(dtype):
     else:
         return dtype.str
 
-def write_array_header_1_0(fp, array):
-    """ Write the header for an array using the 1.0 format.
+def header_data_from_array_1_0(array):
+    """ Get the dictionary of header metadata from a numpy.ndarray.
 
     Parameters
     ----------
-    fp : filelike object
     array : numpy.ndarray
+
+    Returns
+    -------
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
     """
     d = {}
     d['shape'] = array.shape
@@ -137,7 +149,18 @@ def write_array_header_1_0(fp, array):
         d['fortran_order'] = False
 
     d['descr'] = dtype_to_descr(array.dtype)
+    return d
+
+def write_array_header_1_0(fp, d):
+    """ Write the header for an array using the 1.0 format.
 
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
+    """
     header = pprint.pformat(d)
     # Pad the header with spaces and a final newline such that the magic string,
     # the header-length short and the header are aligned on a 16-byte boundary.
@@ -239,7 +262,7 @@ def write_array(fp, array, version=(1,0)):
     if version != (1, 0):
         raise ValueError("we only support format version (1,0), not %s" % (version,))
     fp.write(magic(*version))
-    write_array_header_1_0(fp, array)
+    write_array_header_1_0(fp, header_data_from_array_1_0(array))
     if array.dtype.hasobject:
         # We contain Python objects so we cannot write out the data directly.
         # Instead, we will pickle it out with version 2 of the pickle protocol.
@@ -304,3 +327,86 @@ def read_array(fp):
 
     return array
 
+
+def open_memmap(filename, mode='r+', dtype=None, shape=None,
+    fortran_order=False, version=(1,0)):
+    """ Open a .npy file as a memory-mapped array.
+
+    Parameters
+    ----------
+    filename : str
+    mode : str, optional
+        The mode to open the file with. In addition to the standard file modes,
+        'c' is also accepted to mean "copy on write".
+    dtype : dtype, optional
+    shape : tuple of int, optional
+    fortran_order : bool, optional
+        If the mode is a "write" mode, then the file will be created using this
+        dtype, shape, and contiguity.
+    version : tuple of int (major, minor)
+        If the mode is a "write" mode, then this is the version of the file
+        format used to create the file.
+
+    Returns
+    -------
+    marray : numpy.memmap
+
+    Raises
+    ------
+    ValueError if the data or the mode is invalid.
+    IOError if the file is not found or cannot be opened correctly.
+    """
+    if 'w' in mode:
+        # We are creating the file, not reading it.
+        # Check if we ought to create the file.
+        if version != (1, 0):
+            raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+        # Ensure that the given dtype is an authentic dtype object rather than
+        # just something that can be interpreted as a dtype object.
+        dtype = numpy.dtype(dtype)
+        if dtype.hasobject:
+            raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
+        d = dict(
+            descr=dtype_to_descr(dtype),
+            fortran_order=fortran_order,
+            shape=shape,
+        )
+        # If we got here, then it should be safe to create the file.
+        fp = open(filename, mode+'b')
+        try:
+            fp.write(magic(*version))
+            write_array_header_1_0(fp, d)
+            offset = fp.tell()
+        finally:
+            fp.close()
+    else:
+        # Read the header of the file first.
+        fp = open(filename, 'rb')
+        try:
+            version = read_magic(fp)
+            if version != (1, 0):
+                raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+            shape, fortran_order, dtype = read_array_header_1_0(fp)
+            if dtype.hasobject:
+                raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
+            offset = fp.tell()
+        finally:
+            fp.close()
+
+    if fortran_order:
+        order = 'F'
+    else:
+        order = 'C'
+
+    # Switch the truncating 'w+' mode to the non-truncating 'r+' mode so that
+    # memmap does not overwrite the header we have already written to the file.
+    if mode == 'w+':
+        mode = 'r+'
+
+    marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order,
+        mode=mode, offset=offset)
+
+    return marray
+
+
+
diff --git a/tests/test_format.py b/tests/test_format.py
index 064754977..b238950a3 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -276,6 +276,9 @@ Test the header writing.
 
 
 from cStringIO import StringIO
+import os
+import shutil
+import tempfile
 
 from nose.tools import raises
 
@@ -285,6 +288,20 @@ from numpy.testing import assert_array_equal
 from numpy.lib import format
 
 
+tempdir = None
+
+# Module-level setup.
+def setup_module():
+    global tempdir
+    tempdir = tempfile.mkdtemp()
+
+def teardown_module():
+    global tempdir
+    if tempdir is not None and os.path.isdir(tempdir):
+        shutil.rmtree(tempdir)
+        tempdir = None
+
+
 # Generate some basic arrays to test with.
 scalars = [
     np.uint8,
@@ -395,13 +412,50 @@ def roundtrip(arr):
     arr2 = format.read_array(f2)
     return arr2
 
+def assert_equal(o1, o2):
+    assert o1 == o2
+
 
 def test_roundtrip():
     for arr in basic_arrays + record_arrays:
-        print repr(arr)
         arr2 = roundtrip(arr)
         yield assert_array_equal, arr, arr2
 
+def test_memmap_roundtrip():
+    for arr in basic_arrays + record_arrays:
+        if arr.dtype.hasobject:
+            # Skip these since they can't be mmap'ed.
+            continue
+        # Write it out normally and through mmap.
+        nfn = os.path.join(tempdir, 'normal.npy')
+        mfn = os.path.join(tempdir, 'memmap.npy')
+        fp = open(nfn, 'wb')
+        try:
+            format.write_array(fp, arr)
+        finally:
+            fp.close()
+
+        fortran_order = (arr.flags.f_contiguous and not arr.flags.c_contiguous)
+        ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
+            shape=arr.shape, fortran_order=fortran_order)
+        ma[...] = arr
+        del ma
+
+        # Check that both of these files' contents are the same.
+        fp = open(nfn, 'rb')
+        normal_bytes = fp.read()
+        fp.close()
+        fp = open(mfn, 'rb')
+        memmap_bytes = fp.read()
+        fp.close()
+        yield assert_equal, normal_bytes, memmap_bytes
+
+        # Check that reading the file using memmap works.
+        ma = format.open_memmap(nfn, mode='r')
+        yield assert_array_equal, ma, arr
+        del ma
+
+
 def test_write_version_1_0():
     f = StringIO()
     arr = np.arange(1)
author	Robert Kern <robert.kern@gmail.com>	2007-12-20 07:38:28 +0000
committer	Robert Kern <robert.kern@gmail.com>	2007-12-20 07:38:28 +0000
commit	0c728126f30c79512fdf0dff7c928c387291ce5a (patch)
tree	059e87cfff37b0556a066ca14b553d1e26ab74e3
parent	7ce53484bbe4c54f3fb3f24687846e6c54ad88d5 (diff)
download	numpy-0c728126f30c79512fdf0dff7c928c387291ce5a.tar.gz