1 files changed, 412 insertions, 0 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
new file mode 100644
index 000000000..bb58c5c61
--- /dev/null
+++ b/numpy/lib/format.py
@@ -0,0 +1,412 @@
+""" Define a simple format for saving numpy arrays to disk with the full
+information about them.
+
+WARNING: THE FORMAT IS CURRENTLY UNSTABLE. DO NOT STORE CRITICAL DATA WITH IT.
+         While this code is in an SVN branch, the format may change without
+         notice, without backwards compatibility, and without changing the
+         format's version number. When the code moves into the trunk the format
+         will be stabilized, the version number will increment as changes occur,
+         and backwards compatibility with older versions will be maintained.
+
+Format Version 1.0
+------------------
+
+The first 6 bytes are a magic string: exactly "\\x93NUMPY".
+
+The next 1 byte is an unsigned byte: the major version number of the file
+format, e.g. \\x01.
+
+The next 1 byte is an unsigned byte: the minor version number of the file
+format, e.g. \\x00. Note: the version of the file format is not tied to the
+version of the numpy package.
+
+The next 2 bytes form a little-endian unsigned short int: the length of the
+header data HEADER_LEN.
+
+The next HEADER_LEN bytes form the header data describing the array's format. It
+is an ASCII string which contains a Python literal expression of a dictionary.
+It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make
+the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16
+for alignment purposes.
+
+The dictionary contains three keys:
+
+    "descr" : dtype.descr
+        An object that can be passed as an argument to the numpy.dtype()
+        constructor to create the array's dtype.
+    "fortran_order" : bool
+        Whether the array data is Fortran-contiguous or not. Since
+        Fortran-contiguous arrays are a common form of non-C-contiguity, we
+        allow them to be written directly to disk for efficiency.
+    "shape" : tuple of int
+        The shape of the array.
+
+For repeatability and readability, this dictionary is formatted using
+pprint.pformat() so the keys are in alphabetic order.
+
+Following the header comes the array data. If the dtype contains Python objects
+(i.e. dtype.hasobject is True), then the data is a Python pickle of the array.
+Otherwise the data is the contiguous (either C- or Fortran-, depending on
+fortran_order) bytes of the array. Consumers can figure out the number of bytes
+by multiplying the number of elements given by the shape (noting that shape=()
+means there is 1 element) by dtype.itemsize.
+"""
+
+import cPickle
+import pprint
+import struct
+
+import numpy
+from numpy.lib.utils import safe_eval
+
+
+MAGIC_PREFIX = '\x93NUMPY'  # fixed marker that starts the magic string
+MAGIC_LEN = len(MAGIC_PREFIX) + 2  # prefix + major and minor version bytes
+
+def magic(major, minor):
+    """ Build the magic string that identifies a file format version.
+
+    Parameters
+    ----------
+    major : int in [0, 255]
+    minor : int in [0, 255]
+
+    Returns
+    -------
+    magic : str
+
+    Raises
+    ------
+    ValueError if either version number is outside [0, 255].
+    """
+    if major > 255 or major < 0:
+        raise ValueError("major version must be 0 <= major < 256")
+    if minor > 255 or minor < 0:
+        raise ValueError("minor version must be 0 <= minor < 256")
+    return MAGIC_PREFIX + chr(major) + chr(minor)
+
+def read_magic(fp):
+    """ Consume the magic string from a file and return the format version.
+
+    Parameters
+    ----------
+    fp : filelike object
+
+    Returns
+    -------
+    major : int
+    minor : int
+    """
+    raw = fp.read(MAGIC_LEN)
+    if len(raw) != MAGIC_LEN:
+        raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, raw))
+    prefix, version = raw[:-2], raw[-2:]  # the last two bytes are the version
+    if prefix != MAGIC_PREFIX:
+        raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, prefix))
+    return ord(version[0]), ord(version[1])
+
+def dtype_to_descr(dtype):
+    """ Build a descriptor for the dtype that survives serialization.
+
+    The .descr attribute of a dtype object cannot be round-tripped through the
+    dtype() constructor. For a simple type such as dtype('float32'), .descr
+    looks like a record array with a single field named '', which dtype()
+    takes as a request to make up a default field name. For those types we
+    return dtype.str instead, which can be passed straight back to dtype().
+    """
+    if dtype.names is None:
+        # Not a record array, so the type string is used as the descriptor.
+        return dtype.str
+    # This is a record array. The .descr is fine.
+    # XXX: parts of the record array with an empty name, like padding bytes,
+    # still get fiddled with. This needs to be fixed in the C implementation
+    # of dtype().
+    return dtype.descr
+
+def header_data_from_array_1_0(array):
+    """ Collect the header metadata that describes a numpy.ndarray.
+
+    Parameters
+    ----------
+    array : numpy.ndarray
+
+    Returns
+    -------
+    d : dict
+        The 'descr', 'fortran_order' and 'shape' entries whose string
+        representation gets written to the header of the file.
+    """
+    flags = array.flags
+    if flags.c_contiguous:
+        # Tested first: a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS.
+        fortran_order = False
+    else:
+        # Either purely Fortran-contiguous, or totally non-contiguous data
+        # which we will have to make C-contiguous before writing.
+        fortran_order = bool(flags.f_contiguous)
+
+    return {
+        'descr': dtype_to_descr(array.dtype),
+        'fortran_order': fortran_order,
+        'shape': array.shape,
+    }
+
+def write_array_header_1_0(fp, d):
+    """ Serialize a header dictionary to a file in the 1.0 format.
+
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        The header metadata; its pretty-printed string representation, padded
+        and newline-terminated, is what gets written after a 2-byte length.
+    """
+    text = pprint.pformat(d)
+    # The magic string, the header-length short and the header together should
+    # end on a 16-byte boundary, so append spaces ahead of the closing newline.
+    # Hopefully, some system, possibly memory-mapping, can take advantage of
+    # our premature optimization.
+    unpadded_len = MAGIC_LEN + 2 + len(text) + 1  # 1 for the newline
+    padding = ' ' * (16 - (unpadded_len % 16))
+    header = text + padding + '\n'
+    limit = 256*256
+    if len(header) >= limit:
+        raise ValueError("header does not fit inside %s bytes" % limit)
+    fp.write(struct.pack('<H', len(header)))
+    fp.write(header)
+
+def read_array_header_1_0(fp):
+    """ Read an array header in the 1.0 file format from a filelike object.
+
+    This will leave the file object located just after the header.
+
+    Parameters
+    ----------
+    fp : filelike object
+
+    Returns
+    -------
+    shape : tuple of int
+        The shape of the array.
+    fortran_order : bool
+        True if the array data following the header is stored in
+        Fortran-contiguous order, False if it is stored in C-contiguous order,
+        as recorded in the header.
+    dtype : dtype
+        The dtype constructed from the "descr" entry of the header.
+
+    Raises
+    ------
+    ValueError if the data is invalid.
+    """
+    # Read an unsigned, little-endian short int which has the length of the
+    # header.
+    hlength_str = fp.read(2)
+    if len(hlength_str) != 2:
+        raise ValueError("EOF at %s before reading array header length" % fp.tell())
+    header_length = struct.unpack('<H', hlength_str)[0]
+    header = fp.read(header_length)
+    if len(header) != header_length:
+        raise ValueError("EOF at %s before reading array header" % fp.tell())
+
+    # The header is a pretty-printed string representation of a literal Python
+    # dictionary, padded with spaces and terminated by a newline. The keys
+    # are strings.
+    #   "shape" : tuple of int
+    #   "fortran_order" : bool
+    #   "descr" : dtype.descr
+    try:
+        d = safe_eval(header)
+    except SyntaxError, e:
+        raise ValueError("Cannot parse header: %r\nException: %r" % (header, e))
+    if not isinstance(d, dict):
+        raise ValueError("Header is not a dictionary: %r" % d)
+    keys = d.keys()
+    keys.sort()
+    if keys != ['descr', 'fortran_order', 'shape']:
+        raise ValueError("Header does not contain the correct keys: %r" % (keys,))
+
+    # Sanity-check the values. Dimensions too big for an int parse as a long.
+    if (not isinstance(d['shape'], tuple) or
+        not numpy.all([isinstance(x, (int, long)) for x in d['shape']])):
+        raise ValueError("shape is not valid: %r" % (d['shape'],))
+    if not isinstance(d['fortran_order'], bool):
+        raise ValueError("fortran_order is not a valid bool: %r" % (d['fortran_order'],))
+    try:
+        dtype = numpy.dtype(d['descr'])
+    except TypeError:
+        raise ValueError("descr is not a valid dtype descriptor: %r" % (d['descr'],))
+
+    return d['shape'], d['fortran_order'], dtype
+
+def write_array(fp, array, version=(1,0)):
+    """ Write the magic string, the header and the data of an array to a file.
+
+    An array that is neither C-contiguous nor Fortran-contiguous has to be
+    copied in memory when the filelike object is not a real file object,
+    because its bytes are first gathered into a C-ordered string.
+
+    Parameters
+    ----------
+    fp : filelike object
+    array : numpy.ndarray
+    version : (int, int), optional
+        The version number of the format. Only (1, 0) is supported.
+
+    Raises
+    ------
+    ValueError if the array cannot be persisted.
+    Various other errors from pickling if the array contains Python objects as
+    part of its dtype.
+    """
+    if version != (1, 0):
+        raise ValueError("we only support format version (1,0), not %s" % (version,))
+    fp.write(magic(*version))
+    write_array_header_1_0(fp, header_data_from_array_1_0(array))
+    if array.dtype.hasobject:
+        # Python objects have no raw bytes that we could write out directly,
+        # so pickle the whole array with version 2 of the pickle protocol.
+        cPickle.dump(array, fp, protocol=2)
+        return
+    flags = array.flags
+    if flags.f_contiguous and not flags.c_contiguous:
+        # A suboptimal, possibly memory-intensive, but correct way to handle
+        # Fortran-contiguous arrays.
+        fp.write(array.data)
+    elif isinstance(fp, file):
+        array.tofile(fp)
+    else:
+        # XXX: We could probably chunk this using something like arrayterator.
+        fp.write(array.tostring('C'))
+
+def read_array(fp):
+    """ Read an array, preceded by its magic string and header, from a file.
+
+    Parameters
+    ----------
+    fp : filelike object
+        If this is not a real file object, then this may take extra memory and
+        time.
+
+    Returns
+    -------
+    array : numpy.ndarray
+
+    Raises
+    ------
+    ValueError if the data is invalid.
+    """
+    version = read_magic(fp)
+    if version != (1, 0):
+        raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+    shape, fortran_order, dtype = read_array_header_1_0(fp)
+    # Multiply with Python integers, which cannot overflow; shape=() gives 1.
+    count = 1
+    for dim in shape:
+        count *= dim
+
+    # Now read the actual data.
+    if dtype.hasobject:
+        # The array contained Python objects. We need to unpickle the data.
+        # NOTE: unpickling can run arbitrary code; only read trusted files.
+        array = cPickle.load(fp)
+    else:
+        if isinstance(fp, file):
+            # We can use the fast fromfile() function.
+            array = numpy.fromfile(fp, dtype=dtype, count=count)
+        else:
+            # Not a real file: read it the memory-intensive way. XXX: chunk it.
+            data = fp.read(count * dtype.itemsize)
+            array = numpy.fromstring(data, dtype=dtype, count=count)
+
+        if fortran_order:
+            array.shape = shape[::-1]
+            array = array.transpose()
+        else:
+            array.shape = shape
+
+    return array
+
+
+def open_memmap(filename, mode='r+', dtype=None, shape=None,
+    fortran_order=False, version=(1,0)):
+    """ Memory-map the array stored in a .npy file, creating it if requested.
+
+    Parameters
+    ----------
+    filename : str
+    mode : str, optional
+        The mode to open the file with. In addition to the standard file modes,
+        'c' is also accepted to mean "copy on write".
+    dtype : dtype, optional
+    shape : tuple of int, optional
+    fortran_order : bool, optional
+        Only used when the mode is a "write" mode: the file is created with
+        this dtype, shape, and contiguity. Otherwise the header supplies them.
+    version : tuple of int (major, minor)
+        If the mode is a "write" mode, then this is the version of the file
+        format used to create the file.
+
+    Returns
+    -------
+    marray : numpy.memmap
+
+    Raises
+    ------
+    ValueError if the data or the mode is invalid.
+    IOError if the file is not found or cannot be opened correctly.
+    """
+    creating = 'w' in mode
+    if creating:
+        # We are creating the file, not reading it, so check the arguments
+        # before anything is written.
+        if version != (1, 0):
+            raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+        # Coerce whatever was given into an authentic dtype object, not just
+        # something that can be interpreted as one.
+        dtype = numpy.dtype(dtype)
+        if dtype.hasobject:
+            raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
+        header = {
+            'descr': dtype_to_descr(dtype),
+            'fortran_order': fortran_order,
+            'shape': shape,
+        }
+        # If we got here, then it should be safe to create the file.
+        fp = open(filename, mode+'b')
+        try:
+            fp.write(magic(*version))
+            write_array_header_1_0(fp, header)
+            offset = fp.tell()
+        finally:
+            fp.close()
+    else:
+        # An existing file: read its header first to learn the dtype, shape
+        # and data order.
+        fp = open(filename, 'rb')
+        try:
+            version = read_magic(fp)
+            if version != (1, 0):
+                raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+            shape, fortran_order, dtype = read_array_header_1_0(fp)
+            if dtype.hasobject:
+                raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped")
+            offset = fp.tell()
+        finally:
+            fp.close()
+
+    # numpy.memmap() is given the data order as an order character.
+    order = 'C'
+    if fortran_order:
+        order = 'F'
+
+    # We need to change a write-only mode to a read-write mode since we've
+    # already written data to the file.
+    if mode == 'w+':
+        mode = 'r+'
+
+    return numpy.memmap(filename, dtype=dtype, shape=shape, order=order,
+        mode=mode, offset=offset)
+
+
+