Code establishing a simple format for .npy files.

author: Robert Kern <robert.kern@gmail.com> 2007-12-16 07:52:44 +0000
committer: Robert Kern <robert.kern@gmail.com> 2007-12-16 07:52:44 +0000
commit: 9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1 (patch)
tree: 01d4a3e7ac27e94f8a904c3f23612830c66f3aad
parent: cf73c9df97192d2215a05436bffdfca9259daf9e (diff)
download: numpy-9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1.tar.gz
3 files changed, 876 insertions, 1 deletions
diff --git a/format.py b/format.py
new file mode 100644
index 000000000..a2fa5bb69
--- /dev/null
+++ b/format.py
@@ -0,0 +1,306 @@
+""" Define a simple format for saving numpy arrays to disk with the full
+information about them.
+
+Format Version 1.0
+------------------
+
+The first 6 bytes are a magic string: exactly "\\x93NUMPY".
+
+The next 1 byte is an unsigned byte: the major version number of the file
+format, e.g. \\x01.
+
+The next 1 byte is an unsigned byte: the minor version number of the file
+format, e.g. \\x00. Note: the version of the file format is not tied to the
+version of the numpy package.
+
+The next 2 bytes form a little-endian unsigned short int: the length of the
+header data HEADER_LEN.
+
+The next HEADER_LEN bytes form the header data describing the array's format. It
+is an ASCII string which contains a Python literal expression of a dictionary.
+It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make
+the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16
+for alignment purposes.
+
+The dictionary contains three keys:
+
+    "descr" : dtype.descr
+        An object that can be passed as an argument to the numpy.dtype()
+        constructor to create the array's dtype.
+    "fortran_order" : bool
+        Whether the array data is Fortran-contiguous or not. Since
+        Fortran-contiguous arrays are a common form of non-C-contiguity, we
+        allow them to be written directly to disk for efficiency.
+    "shape" : tuple of int
+        The shape of the array.
+
+For repeatability and readability, this dictionary is formatted using
+pprint.pformat() so the keys are in alphabetic order.
+
+Following the header comes the array data. If the dtype contains Python objects
+(i.e. dtype.hasobject is True), then the data is a Python pickle of the array.
+Otherwise the data is the contiguous (either C- or Fortran-, depending on
+fortran_order) bytes of the array. Consumers can figure out the number of bytes
+by multiplying the number of elements given by the shape (noting that shape=()
+means there is 1 element) by dtype.itemsize.
+"""
+
+import cPickle
+import pprint
+import struct
+
+import numpy
+from numpy.lib.utils import safe_eval
+
+
+MAGIC_PREFIX = '\x93NUMPY'
+MAGIC_LEN = len(MAGIC_PREFIX) + 2
+
+def magic(major, minor):
+    """ Return the magic string for the given file format version.
+
+    Parameters
+    ----------
+    major : int in [0, 255]
+    minor : int in [0, 255]
+
+    Returns
+    -------
+    magic : str
+
+    Raises
+    ------
+    ValueError if the version cannot be formatted.
+    """
+    if major < 0 or major > 255:
+        raise ValueError("major version must be 0 <= major < 256")
+    if minor < 0 or minor > 255:
+        raise ValueError("minor version must be 0 <= minor < 256")
+    return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor))
+
+def read_magic(fp):
+    """ Read the magic string to get the version of the file format.
+
+    Parameters
+    ----------
+    fp : filelike object
+
+    Returns
+    -------
+    major : int
+    minor : int
+    """
+    magic_str = fp.read(MAGIC_LEN)
+    if len(magic_str) != MAGIC_LEN:
+        raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str))
+    if magic_str[:-2] != MAGIC_PREFIX:
+        raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2]))
+    major, minor = map(ord, magic_str[-2:])
+    return major, minor
+
+def dtype_to_descr(dtype):
+    """ Get a serializable descriptor from the dtype.
+
+    The .descr attribute of a dtype object cannot be round-tripped through the
+    dtype() constructor. Simple types, like dtype('float32'), have a descr which
+    looks like a record array with one field with '' as a name. The dtype()
+    constructor interprets this as a request to give a default name. Instead, we
+    construct a descriptor that can be passed to dtype().
+    """
+    if dtype.names is not None:
+        # This is a record array. The .descr is fine.
+        # XXX: parts of the record array with an empty name, like padding bytes,
+        # still get fiddled with. This needs to be fixed in the C implementation
+        # of dtype().
+        return dtype.descr
+    else:
+        return dtype.str
+
+def write_array_header_1_0(fp, array):
+    """ Write the header for an array using the 1.0 format.
+
+    Parameters
+    ----------
+    fp : filelike object
+    array : numpy.ndarray
+    """
+    d = {}
+    d['shape'] = array.shape
+    if array.flags.c_contiguous:
+        d['fortran_order'] = False
+    elif array.flags.f_contiguous:
+        d['fortran_order'] = True
+    else:
+        # Totally non-contiguous data. We will have to make it C-contiguous
+        # before writing. Note that we need to test for C_CONTIGUOUS first
+        # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS.
+        d['fortran_order'] = False
+
+    d['descr'] = dtype_to_descr(array.dtype)
+
+    header = pprint.pformat(d)
+    # Pad the header with spaces and a final newline such that the magic string,
+    # the header-length short and the header are aligned on a 16-byte boundary.
+    # Hopefully, some system, possibly memory-mapping, can take advantage of
+    # our premature optimization.
+    current_header_len = MAGIC_LEN + 2 + len(header) + 1  # 1 for the newline
+    topad = 16 - (current_header_len % 16)
+    header = '%s%s\n' % (header, ' '*topad)
+    if len(header) >= (256*256):
+        raise ValueError("header does not fit inside %s bytes" % (256*256))
+    header_len_str = struct.pack('<H', len(header))
+    fp.write(header_len_str)
+    fp.write(header)
+
+def read_array_header_1_0(fp):
+    """ Read an array header from a filelike object using the 1.0 file format
+    version.
+
+    This will leave the file object located just after the header.
+
+    Parameters
+    ----------
+    fp : filelike object
+
+    Returns
+    -------
+    shape : tuple of int
+        The shape of the array.
+    fortran_order : bool
+        True if the stored data is in Fortran (column-major) order, False if
+        it is in C order. The writer stores data directly when it is C- or
+        Fortran-contiguous and otherwise makes it C-contiguous first.
+    dtype : dtype
+
+    Raises
+    ------
+    ValueError if the data is invalid.
+    """
+    # Read an unsigned, little-endian short int which has the length of the
+    # header.
+    hlength_str = fp.read(2)
+    if len(hlength_str) != 2:
+        raise ValueError("EOF at %s before reading array header length" % fp.tell())
+    header_length = struct.unpack('<H', hlength_str)[0]
+    header = fp.read(header_length)
+    if len(header) != header_length:
+        raise ValueError("EOF at %s before reading array header" % fp.tell())
+
+    # The header is a pretty-printed string representation of a literal Python
+    # dictionary, padded with spaces and ended by a newline so that the data
+    # starts on a 16-byte boundary. The keys are strings.
+    #   "shape" : tuple of int
+    #   "fortran_order" : bool
+    #   "descr" : dtype.descr
+    try:
+        d = safe_eval(header)
+    except SyntaxError, e:
+        raise ValueError("Cannot parse header: %r\nException: %r" % (header, e))
+    if not isinstance(d, dict):
+        raise ValueError("Header is not a dictionary: %r" % d)
+    keys = d.keys()
+    keys.sort()
+    if keys != ['descr', 'fortran_order', 'shape']:
+        raise ValueError("Header does not contain the correct keys: %r" % (keys,))
+
+    # Sanity-check the values.
+    if (not isinstance(d['shape'], tuple) or 
+        not numpy.all([isinstance(x, int) for x in d['shape']])):
+        raise ValueError("shape is not valid: %r" % (d['shape'],))
+    if not isinstance(d['fortran_order'], bool):
+        raise ValueError("fortran_order is not a valid bool: %r" % (d['fortran_order'],))
+    try:
+        dtype = numpy.dtype(d['descr'])
+    except TypeError, e:
+        raise ValueError("descr is not a valid dtype descriptor: %r" % (d['descr'],))
+
+    return d['shape'], d['fortran_order'], dtype
+
+def write_array(fp, array, version=(1,0)):
+    """ Write an array to a file, including a header.
+
+    If the array is neither C-contiguous nor Fortran-contiguous AND if the
+    filelike object is not a real file object, then this function will have to
+    copy data in memory.
+
+    Parameters
+    ----------
+    fp : filelike object
+    array : numpy.ndarray
+    version : (int, int), optional
+        The version number of the format.
+
+    Raises
+    ------
+    ValueError if the array cannot be persisted.
+    Various other errors from pickling if the array contains Python objects as
+    part of its dtype.
+    """
+    if version != (1, 0):
+        raise ValueError("we only support format version (1,0), not %s" % (version,))
+    fp.write(magic(*version))
+    write_array_header_1_0(fp, array)
+    if array.dtype.hasobject:
+        # We contain Python objects so we cannot write out the data directly.
+        # Instead, we will pickle it out with version 2 of the pickle protocol.
+        cPickle.dump(array, fp, protocol=2)
+    elif array.flags.f_contiguous and not array.flags.c_contiguous:
+        # Use a suboptimal, possibly memory-intensive, but correct way to handle
+        # Fortran-contiguous arrays.
+        fp.write(array.data)
+    else:
+        if isinstance(fp, file):
+            array.tofile(fp)
+        else:
+            # XXX: We could probably chunk this using something like
+            # arrayterator.
+            fp.write(array.tostring('C'))
+
+def read_array(fp):
+    """ Read an array from a file.
+
+    Parameters
+    ----------
+    fp : filelike object
+        If this is not a real file object, then this may take extra memory and
+        time.
+
+    Returns
+    -------
+    array : numpy.ndarray
+
+    Raises
+    ------
+    ValueError if the data is invalid.
+    """
+    version = read_magic(fp)
+    if version != (1, 0):
+        raise ValueError("only support version (1,0) of file format, not %r" % (version,))
+    shape, fortran_order, dtype = read_array_header_1_0(fp)
+    if len(shape) == 0:
+        count = 1
+    else:
+        count = numpy.multiply.reduce(shape)
+
+    # Now read the actual data.
+    if dtype.hasobject:
+        # The array contained Python objects. We need to unpickle the data.
+        array = cPickle.load(fp)
+    else:
+        if isinstance(fp, file):
+            # We can use the fast fromfile() function.
+            array = numpy.fromfile(fp, dtype=dtype, count=count)
+        else:
+            # This is not a real file. We have to read it the memory-intensive way.
+            # XXX: we can probably chunk this to avoid the memory hit.
+            data = fp.read(count * dtype.itemsize)
+            array = numpy.fromstring(data, dtype=dtype, count=count)
+
+        if fortran_order:
+            array.shape = shape[::-1]
+            array = array.transpose()
+        else:
+            array.shape = shape
+
+    return array
+
diff --git a/tests/test_format.py b/tests/test_format.py
new file mode 100644
index 000000000..064754977
--- /dev/null
+++ b/tests/test_format.py
@@ -0,0 +1,458 @@
+r''' Test the .npy file format.
+
+Set up:
+
+    >>> import numpy as np
+    >>> from cStringIO import StringIO
+    >>> from numpy.lib import format
+    >>>
+    >>> scalars = [
+    ...     np.uint8,
+    ...     np.int8,
+    ...     np.uint16,
+    ...     np.int16,
+    ...     np.uint32,
+    ...     np.int32,
+    ...     np.uint64,
+    ...     np.int64,
+    ...     np.float32,
+    ...     np.float64,
+    ...     np.complex64,
+    ...     np.complex128,
+    ...     object,
+    ... ]
+    >>> 
+    >>> basic_arrays = []
+    >>> 
+    >>> for scalar in scalars:
+    ...     for endian in '<>':
+    ...         dtype = np.dtype(scalar).newbyteorder(endian)
+    ...         basic = np.arange(15).astype(dtype)
+    ...         basic_arrays.extend([
+    ...             np.array([], dtype=dtype),
+    ...             np.array(10, dtype=dtype),
+    ...             basic,
+    ...             basic.reshape((3,5)),
+    ...             basic.reshape((3,5)).T,
+    ...             basic.reshape((3,5))[::-1,::2],
+    ...         ])
+    ... 
+    >>> 
+    >>> Pdescr = [
+    ...     ('x', 'i4', (2,)),
+    ...     ('y', 'f8', (2, 2)),
+    ...     ('z', 'u1')]
+    >>> 
+    >>> 
+    >>> PbufferT = [
+    ...     ([3,2], [[6.,4.],[6.,4.]], 8),
+    ...     ([4,3], [[7.,5.],[7.,5.]], 9),
+    ...     ]
+    >>> 
+    >>> 
+    >>> Ndescr = [
+    ...     ('x', 'i4', (2,)),
+    ...     ('Info', [
+    ...         ('value', 'c16'),
+    ...         ('y2', 'f8'),
+    ...         ('Info2', [
+    ...             ('name', 'S2'),
+    ...             ('value', 'c16', (2,)),
+    ...             ('y3', 'f8', (2,)),
+    ...             ('z3', 'u4', (2,))]),
+    ...         ('name', 'S2'),
+    ...         ('z2', 'b1')]),
+    ...     ('color', 'S2'),
+    ...     ('info', [
+    ...         ('Name', 'U8'),
+    ...         ('Value', 'c16')]),
+    ...     ('y', 'f8', (2, 2)),
+    ...     ('z', 'u1')]
+    >>> 
+    >>> 
+    >>> NbufferT = [
+    ...     ([3,2], (6j, 6., ('nn', [6j,4j], [6.,4.], [1,2]), 'NN', True), 'cc', ('NN', 6j), [[6.,4.],[6.,4.]], 8),
+    ...     ([4,3], (7j, 7., ('oo', [7j,5j], [7.,5.], [2,1]), 'OO', False), 'dd', ('OO', 7j), [[7.,5.],[7.,5.]], 9),
+    ...     ]
+    >>> 
+    >>> 
+    >>> record_arrays = [
+    ...     np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')),
+    ...     np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')),
+    ...     np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')),
+    ...     np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')),
+    ... ]
+
+Test the magic string writing.
+
+    >>> format.magic(1, 0)
+    '\x93NUMPY\x01\x00'
+    >>> format.magic(0, 0)
+    '\x93NUMPY\x00\x00'
+    >>> format.magic(255, 255)
+    '\x93NUMPY\xff\xff'
+    >>> format.magic(2, 5)
+    '\x93NUMPY\x02\x05'
+
+Test the magic string reading.
+
+    >>> format.read_magic(StringIO(format.magic(1, 0)))
+    (1, 0)
+    >>> format.read_magic(StringIO(format.magic(0, 0)))
+    (0, 0)
+    >>> format.read_magic(StringIO(format.magic(255, 255)))
+    (255, 255)
+    >>> format.read_magic(StringIO(format.magic(2, 5)))
+    (2, 5)
+
+Test the header writing.
+
+    >>> for arr in basic_arrays + record_arrays:
+    ...     f = StringIO()
+    ...     format.write_array_header_1_0(f, arr)
+    ...     print repr(f.getvalue())
+    ... 
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<u2', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>u2', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<i2', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>i2', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<u4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>u4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<i4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>i4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<u8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>u8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<i8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>i8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<f4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>f4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<f8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>f8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '<c8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '>c8', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (0,)}             \n"
+    "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': ()}               \n"
+    "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (15,)}            \n"
+    "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 5)}           \n"
+    "F\x00{'descr': '<c16', 'fortran_order': True, 'shape': (5, 3)}            \n"
+    "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 3)}           \n"
+    "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (0,)}             \n"
+    "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': ()}               \n"
+    "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (15,)}            \n"
+    "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 5)}           \n"
+    "F\x00{'descr': '>c16', 'fortran_order': True, 'shape': (5, 3)}            \n"
+    "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 3)}           \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|O4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (0,)}              \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': ()}                \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (15,)}             \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 5)}            \n"
+    "F\x00{'descr': '|O4', 'fortran_order': True, 'shape': (5, 3)}             \n"
+    "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 3)}            \n"
+    "v\x00{'descr': [('x', '<i4', (2,)), ('y', '<f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)}         \n"
+    "\x16\x02{'descr': [('x', '<i4', (2,)),\n           ('Info',\n            [('value', '<c16'),\n             ('y2', '<f8'),\n             ('Info2',\n              [('name', '|S2'),\n               ('value', '<c16', (2,)),\n               ('y3', '<f8', (2,)),\n               ('z3', '<u4', (2,))]),\n             ('name', '|S2'),\n             ('z2', '|b1')]),\n           ('color', '|S2'),\n           ('info', [('Name', '<U8'), ('Value', '<c16')]),\n           ('y', '<f8', (2, 2)),\n           ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)}      \n"
+    "v\x00{'descr': [('x', '>i4', (2,)), ('y', '>f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)}         \n"
+    "\x16\x02{'descr': [('x', '>i4', (2,)),\n           ('Info',\n            [('value', '>c16'),\n             ('y2', '>f8'),\n             ('Info2',\n              [('name', '|S2'),\n               ('value', '>c16', (2,)),\n               ('y3', '>f8', (2,)),\n               ('z3', '>u4', (2,))]),\n             ('name', '|S2'),\n             ('z2', '|b1')]),\n           ('color', '|S2'),\n           ('info', [('Name', '>U8'), ('Value', '>c16')]),\n           ('y', '>f8', (2, 2)),\n           ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)}      \n"
+'''
+
+
+from cStringIO import StringIO
+
+from nose.tools import raises
+
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from numpy.lib import format
+
+
+# Generate some basic arrays to test with.
+scalars = [
+    np.uint8,
+    np.int8,
+    np.uint16,
+    np.int16,
+    np.uint32,
+    np.int32,
+    np.uint64,
+    np.int64,
+    np.float32,
+    np.float64,
+    np.complex64,
+    np.complex128,
+    object,
+]
+basic_arrays = []
+for scalar in scalars:
+    for endian in '<>':
+        dtype = np.dtype(scalar).newbyteorder(endian)
+        basic = np.arange(15).astype(dtype)
+        basic_arrays.extend([
+            # Empty
+            np.array([], dtype=dtype),
+            # Rank-0
+            np.array(10, dtype=dtype),
+            # 1-D
+            basic,
+            # 2-D C-contiguous
+            basic.reshape((3,5)),
+            # 2-D F-contiguous
+            basic.reshape((3,5)).T,
+            # 2-D non-contiguous
+            basic.reshape((3,5))[::-1,::2],
+        ])
+
+# More complicated record arrays.
+# This is the structure of the table used for plain objects:
+#
+# +-+-+-+
+# |x|y|z|
+# +-+-+-+
+
+# Structure of a plain array description:
+Pdescr = [
+    ('x', 'i4', (2,)),
+    ('y', 'f8', (2, 2)),
+    ('z', 'u1')]
+
+# A plain list of tuples with values for testing:
+PbufferT = [
+    # x     y                  z
+    ([3,2], [[6.,4.],[6.,4.]], 8),
+    ([4,3], [[7.,5.],[7.,5.]], 9),
+    ]
+
+
+# This is the structure of the table used for nested objects (DON'T PANIC!):
+#
+# +-+---------------------------------+-----+----------+-+-+
+# |x|Info                             |color|info      |y|z|
+# | +-----+--+----------------+----+--+     +----+-----+ | |
+# | |value|y2|Info2           |name|z2|     |Name|Value| | |
+# | |     |  +----+-----+--+--+    |  |     |    |     | | |
+# | |     |  |name|value|y3|z3|    |  |     |    |     | | |
+# +-+-----+--+----+-----+--+--+----+--+-----+----+-----+-+-+
+#
+
+# The corresponding nested array description:
+Ndescr = [
+    ('x', 'i4', (2,)),
+    ('Info', [
+        ('value', 'c16'),
+        ('y2', 'f8'),
+        ('Info2', [
+            ('name', 'S2'),
+            ('value', 'c16', (2,)),
+            ('y3', 'f8', (2,)),
+            ('z3', 'u4', (2,))]),
+        ('name', 'S2'),
+        ('z2', 'b1')]),
+    ('color', 'S2'),
+    ('info', [
+        ('Name', 'U8'),
+        ('Value', 'c16')]),
+    ('y', 'f8', (2, 2)),
+    ('z', 'u1')]
+
+NbufferT = [
+    # x     Info                                                color info        y                  z
+    #       value y2 Info2                            name z2         Name Value
+    #                name   value    y3       z3
+    ([3,2], (6j, 6., ('nn', [6j,4j], [6.,4.], [1,2]), 'NN', True), 'cc', ('NN', 6j), [[6.,4.],[6.,4.]], 8),
+    ([4,3], (7j, 7., ('oo', [7j,5j], [7.,5.], [2,1]), 'OO', False), 'dd', ('OO', 7j), [[7.,5.],[7.,5.]], 9),
+    ]
+
+record_arrays = [
+    np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')),
+    np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')),
+    np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')),
+    np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')),
+]
+
+def roundtrip(arr):
+    f = StringIO()
+    format.write_array(f, arr)
+    f2 = StringIO(f.getvalue())
+    arr2 = format.read_array(f2)
+    return arr2
+
+
+def test_roundtrip():
+    for arr in basic_arrays + record_arrays:
+        print repr(arr)
+        arr2 = roundtrip(arr)
+        yield assert_array_equal, arr, arr2
+
+def test_write_version_1_0():
+    f = StringIO()
+    arr = np.arange(1)
+    # These should pass.
+    format.write_array(f, arr, version=(1, 0))
+    format.write_array(f, arr)
+
+    # These should all fail.
+    bad_versions = [
+        (1, 1),
+        (0, 0),
+        (0, 1),
+        (2, 0),
+        (2, 2),
+        (255, 255),
+    ]
+    for version in bad_versions:
+        try:
+            format.write_array(f, arr, version=version)
+        except ValueError:
+            pass
+        else:
+            raise AssertionError("we should have raised a ValueError for the bad version %r" % (version,))
+
+
+bad_version_magic = [
+    '\x93NUMPY\x01\x01',
+    '\x93NUMPY\x00\x00',
+    '\x93NUMPY\x00\x01',
+    '\x93NUMPY\x02\x00',
+    '\x93NUMPY\x02\x02',
+    '\x93NUMPY\xff\xff',
+]
+malformed_magic = [
+    '\x92NUMPY\x01\x00',
+    '\x00NUMPY\x01\x00',
+    '\x93numpy\x01\x00',
+    '\x93MATLB\x01\x00',
+    '\x93NUMPY\x01',
+    '\x93NUMPY',
+    '',
+]
+
+def test_read_magic_bad_magic():
+    for magic in malformed_magic:
+        f = StringIO(magic)
+        yield raises(ValueError)(format.read_magic), f
+
+def test_read_version_1_0_bad_magic():
+    for magic in bad_version_magic + malformed_magic:
+        f = StringIO(magic)
+        yield raises(ValueError)(format.read_array), f
+
+
diff --git a/utils.py b/utils.py
index 19a10f518..048ffafc0 100644
--- a/utils.py
+++ b/utils.py
@@ -1,3 +1,4 @@
+import compiler
 import os
 import sys
 import inspect
@@ -10,7 +11,7 @@ __all__ = ['issubclass_', 'get_numpy_include', 'issubsctype',
            'issubdtype', 'deprecate', 'deprecate_with_doc',
            'get_numarray_include',
            'get_include', 'info', 'source', 'who',
-           'byte_bounds', 'may_share_memory']
+           'byte_bounds', 'may_share_memory', 'safe_eval']
 
 def issubclass_(arg1, arg2):
     try:
@@ -466,3 +467,113 @@ def source(object, output=sys.stdout):
         print >> output,  inspect.getsource(object)
     except:
         print >> output,  "Not available for this object."
+
+#-----------------------------------------------------------------------------
+
+# The following SafeEval class and company are adapted from Michael Spencer's
+# ASPN Python Cookbook recipe:
+#   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/364469
+# Accordingly it is mostly Copyright 2006 by Michael Spencer.
+# The recipe, like most of the other ASPN Python Cookbook recipes, was made
+# available under the Python license.
+#   http://www.python.org/license
+
+# It has been modified to:
+#   * handle unary -/+
+#   * support True/False/None
+#   * raise SyntaxError instead of a custom exception.
+
+class SafeEval(object):
+    """ Walker that turns a parsed literal expression into its value.
+
+    visit() dispatches on the class name of the node: a node of class Foo
+    is handled by the method visitFoo.  Node classes without such a method
+    end up in default(), which raises SyntaxError.
+    """
+
+    def visit(self, node, **kw):
+        # Look up 'visit<NodeClassName>', falling back to default().
+        handler_name = 'visit' + node.__class__.__name__
+        handler = getattr(self, handler_name, self.default)
+        return handler(node, **kw)
+
+    def default(self, node, **kw):
+        # Anything without a dedicated visit* method is rejected.
+        raise SyntaxError("Unsupported source construct: %s" % node.__class__)
+
+    def visitExpression(self, node, **kw):
+        # Only the first child is evaluated; with no children the result
+        # is None.
+        for first_child in node.getChildNodes():
+            return self.visit(first_child, **kw)
+        return None
+
+    def visitConst(self, node, **kw):
+        return node.value
+
+    def visitDict(self, node, **kw):
+        # Evaluate every key/value pair first, then build the dict.
+        pairs = [(self.visit(key_node), self.visit(value_node))
+                 for key_node, value_node in node.items]
+        return dict(pairs)
+
+    def visitTuple(self, node, **kw):
+        elements = [self.visit(element) for element in node.nodes]
+        return tuple(elements)
+
+    def visitList(self, node, **kw):
+        elements = [self.visit(element) for element in node.nodes]
+        return elements
+
+    def visitUnaryAdd(self, node, **kw):
+        operand = node.getChildNodes()[0]
+        return +self.visit(operand)
+
+    def visitUnarySub(self, node, **kw):
+        operand = node.getChildNodes()[0]
+        return -self.visit(operand)
+
+    def visitName(self, node, **kw):
+        # Names spelling False/True/None map to the matching constant;
+        # every other name is refused.
+        for spelling, constant in (('False', False),
+                                   ('True', True),
+                                   ('None', None)):
+            if node.name == spelling:
+                return constant
+        raise SyntaxError("Unknown name: %s" % node.name)
+
+def safe_eval(source):
+    """ Evaluate a string containing a Python literal expression without
+    allowing the execution of arbitrary non-literal code.
+
+    Parameters
+    ----------
+    source : str
+        The text of the expression to evaluate.
+
+    Returns
+    -------
+    obj : object
+        The value described by the literal expression.
+
+    Raises
+    ------
+    SyntaxError if the code is invalid Python expression syntax or if it
+    contains non-literal code.
+
+    Examples
+    --------
+    >>> from numpy.lib.utils import safe_eval
+    >>> safe_eval('1')
+    1
+    >>> safe_eval('[1, 2, 3]')
+    [1, 2, 3]
+    >>> safe_eval('{"foo": ("bar", 10.0)}')
+    {'foo': ('bar', 10.0)}
+    >>> safe_eval('import os')
+    Traceback (most recent call last):
+      ...
+    SyntaxError: invalid syntax
+    >>> safe_eval('open("/home/user/.ssh/id_dsa").read()')
+    Traceback (most recent call last):
+      ...
+    SyntaxError: Unsupported source construct: compiler.ast.CallFunc
+    >>> safe_eval('dict')
+    Traceback (most recent call last):
+      ...
+    SyntaxError: Unknown name: dict
+    """
+    # Parse in "eval" mode.  A SyntaxError raised by the parser is allowed to
+    # propagate unchanged, so no try/except wrapper is needed around it.
+    tree = compiler.parse(source, "eval")
+    # The walker raises SyntaxError itself for any node it does not support;
+    # that, too, reaches the caller as-is.
+    return SafeEval().visit(tree)
+
+#-----------------------------------------------------------------------------
+
+
author	Robert Kern <robert.kern@gmail.com>	2007-12-16 07:52:44 +0000
committer	Robert Kern <robert.kern@gmail.com>	2007-12-16 07:52:44 +0000
commit	9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1 (patch)
tree	01d4a3e7ac27e94f8a904c3f23612830c66f3aad
parent	cf73c9df97192d2215a05436bffdfca9259daf9e (diff)
download	numpy-9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1.tar.gz