diff options
author | Robert Kern <robert.kern@gmail.com> | 2007-12-16 07:52:44 +0000 |
---|---|---|
committer | Robert Kern <robert.kern@gmail.com> | 2007-12-16 07:52:44 +0000 |
commit | 9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1 (patch) | |
tree | 01d4a3e7ac27e94f8a904c3f23612830c66f3aad | |
parent | cf73c9df97192d2215a05436bffdfca9259daf9e (diff) | |
download | numpy-9f25dd88311ecbbf6e861a3c9c1c265dde1d52d1.tar.gz |
Code establishing a simple format for .npy files.
-rw-r--r-- | format.py | 306 | ||||
-rw-r--r-- | tests/test_format.py | 458 | ||||
-rw-r--r-- | utils.py | 113 |
3 files changed, 876 insertions, 1 deletions
diff --git a/format.py b/format.py new file mode 100644 index 000000000..a2fa5bb69 --- /dev/null +++ b/format.py @@ -0,0 +1,306 @@ +""" Define a simple format for saving numpy arrays to disk with the full +information about them. + +Format Version 1.0 +------------------ + +The first 6 bytes are a magic string: exactly "\\x93NUMPY". + +The next 1 byte is an unsigned byte: the major version number of the file +format, e.g. \\x01. + +The next 1 byte is an unsigned byte: the minor version number of the file +format, e.g. \\x00. Note: the version of the file format is not tied to the +version of the numpy package. + +The next 2 bytes form an unsigned short int: the length of the header data +HEADER_LEN. + +The next HEADER_LEN bytes form the header data describing the array's format. It +is an ASCII string which contains a Python literal expression of a dictionary. +It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make +the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 +for alignment purposes. + +The dictionary contains three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. + "fortran_order" : bool + Whether the array data is Fortran-contiguous or not. Since + Fortran-contiguous arrays are a common form of non-C-contiguity, we + allow them to be written directly to disk for efficiency. + "shape" : tuple of int + The shape of the array. + +For repeatability and readability, this dictionary is formatted using +pprint.pformat() so the keys are in alphabetic order. + +Following the header comes the array data. If the dtype contains Python objects +(i.e. dtype.hasobject is True), then the data is a Python pickle of the array. +Otherwise the data is the contiguous (either C- or Fortran-, depending on +fortran_order) bytes of the array. 
Consumers can figure out the number of bytes +by multiplying the number of elements given by the shape (noting that shape=() +means there is 1 element) by dtype.itemsize. +""" + +import cPickle +import pprint +import struct + +import numpy +from numpy.lib.utils import safe_eval + + +MAGIC_PREFIX = '\x93NUMPY' +MAGIC_LEN = len(MAGIC_PREFIX) + 2 + +def magic(major, minor): + """ Return the magic string for the given file format version. + + Parameters + ---------- + major : int in [0, 255] + minor : int in [0, 255] + + Returns + ------- + magic : str + + Raises + ------ + ValueError if the version cannot be formatted. + """ + if major < 0 or major > 255: + raise ValueError("major version must be 0 <= major < 256") + if minor < 0 or minor > 255: + raise ValueError("minor version must be 0 <= minor < 256") + return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor)) + +def read_magic(fp): + """ Read the magic string to get the version of the file format. + + Parameters + ---------- + fp : filelike object + + Returns + ------- + major : int + minor : int + """ + magic_str = fp.read(MAGIC_LEN) + if len(magic_str) != MAGIC_LEN: + raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str)) + if magic_str[:-2] != MAGIC_PREFIX: + raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2])) + major, minor = map(ord, magic_str[-2:]) + return major, minor + +def dtype_to_descr(dtype): + """ Get a serializable descriptor from the dtype. + + The .descr attribute of a dtype object cannot be round-tripped through the + dtype() constructor. Simple types, like dtype('float32'), have a descr which + looks like a record array with one field with '' as a name. The dtype() + constructor interprets this as a request to give a default name. Instead, we + construct a descriptor that can be passed to dtype(). + """ + if dtype.names is not None: + # This is a record array. The .descr is fine. 
+ # XXX: parts of the record array with an empty name, like padding bytes, + # still get fiddled with. This needs to be fixed in the C implementation + # of dtype(). + return dtype.descr + else: + return dtype.str + +def write_array_header_1_0(fp, array): + """ Write the header for an array using the 1.0 format. + + Parameters + ---------- + fp : filelike object + array : numpy.ndarray + """ + d = {} + d['shape'] = array.shape + if array.flags.c_contiguous: + d['fortran_order'] = False + elif array.flags.f_contiguous: + d['fortran_order'] = True + else: + # Totally non-contiguous data. We will have to make it C-contiguous + # before writing. Note that we need to test for C_CONTIGUOUS first + # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS. + d['fortran_order'] = False + + d['descr'] = dtype_to_descr(array.dtype) + + header = pprint.pformat(d) + # Pad the header with spaces and a final newline such that the magic string, + # the header-length short and the header are aligned on a 16-byte boundary. + # Hopefully, some system, possibly memory-mapping, can take advantage of + # our premature optimization. + current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline + topad = 16 - (current_header_len % 16) + header = '%s%s\n' % (header, ' '*topad) + if len(header) >= (256*256): + raise ValueError("header does not fit inside %s bytes" % (256*256)) + header_len_str = struct.pack('<H', len(header)) + fp.write(header_len_str) + fp.write(header) + +def read_array_header_1_0(fp): + """ Read an array header from a filelike object using the 1.0 file format + version. + + This will leave the file object located just after the header. + + Parameters + ---------- + fp : filelike object + + Returns + ------- + shape : tuple of int + The shape of the array. + fortran_order : bool + The array data will be written out directly if it is either C-contiguous + or Fortran-contiguous. Otherwise, it will be made contiguous before + writing it out. 
+ dtype : dtype + + Raises + ------ + ValueError if the data is invalid. + """ + # Read an unsigned, little-endian short int which has the length of the + # header. + hlength_str = fp.read(2) + if len(hlength_str) != 2: + raise ValueError("EOF at %s before reading array header length" % fp.tell()) + header_length = struct.unpack('<H', hlength_str)[0] + header = fp.read(header_length) + if len(header) != header_length: + raise ValueError("EOF at %s before reading array header" % fp.tell()) + + # The header is a pretty-printed string representation of a literal Python + # dictionary with trailing newlines padded to a 16-byte boundary. The keys + # are strings. + # "shape" : tuple of int + # "fortran_order" : bool + # "descr" : dtype.descr + try: + d = safe_eval(header) + except SyntaxError, e: + raise ValueError("Cannot parse header: %r\nException: %r" % (header, e)) + if not isinstance(d, dict): + raise ValueError("Header is not a dictionary: %r" % d) + keys = d.keys() + keys.sort() + if keys != ['descr', 'fortran_order', 'shape']: + raise ValueError("Header does not contain the correct keys: %r" % (keys,)) + + # Sanity-check the values. + if (not isinstance(d['shape'], tuple) or + not numpy.all([isinstance(x, int) for x in d['shape']])): + raise ValueError("shape is not valid: %r" % (d['shape'],)) + if not isinstance(d['fortran_order'], bool): + raise ValueError("fortran_order is not a valid bool: %r" % (d['fortran_order'],)) + try: + dtype = numpy.dtype(d['descr']) + except TypeError, e: + raise ValueError("descr is not a valid dtype descriptor: %r" % (d['descr'],)) + + return d['shape'], d['fortran_order'], dtype + +def write_array(fp, array, version=(1,0)): + """ Write an array to a file, including a header. + + If the array is neither C-contiguous nor Fortran-contiguous AND if the + filelike object is not a real file object, then this function will have to + copy data in memory. 
+ + Parameters + ---------- + fp : filelike object + array : numpy.ndarray + version : (int, int), optional + The version number of the format. + + Raises + ------ + ValueError if the array cannot be persisted. + Various other errors from pickling if the array contains Python objects as + part of its dtype. + """ + if version != (1, 0): + raise ValueError("we only support format version (1,0), not %s" % (version,)) + fp.write(magic(*version)) + write_array_header_1_0(fp, array) + if array.dtype.hasobject: + # We contain Python objects so we cannot write out the data directly. + # Instead, we will pickle it out with version 2 of the pickle protocol. + cPickle.dump(array, fp, protocol=2) + elif array.flags.f_contiguous and not array.flags.c_contiguous: + # Use a suboptimal, possibly memory-intensive, but correct way to handle + # Fortran-contiguous arrays. + fp.write(array.data) + else: + if isinstance(fp, file): + array.tofile(fp) + else: + # XXX: We could probably chunk this using something like + # arrayterator. + fp.write(array.tostring('C')) + +def read_array(fp): + """ Read an array from a file. + + Parameters + ---------- + fp : filelike object + If this is not a real file object, then this may take extra memory and + time. + + Returns + ------- + array : numpy.ndarray + + Raises + ------ + ValueError if the data is invalid. + """ + version = read_magic(fp) + if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + shape, fortran_order, dtype = read_array_header_1_0(fp) + if len(shape) == 0: + count = 1 + else: + count = numpy.multiply.reduce(shape) + + # Now read the actual data. + if dtype.hasobject: + # The array contained Python objects. We need to unpickle the data. + array = cPickle.load(fp) + else: + if isinstance(fp, file): + # We can use the fast fromfile() function. + array = numpy.fromfile(fp, dtype=dtype, count=count) + else: + # This is not a real file. 
We have to read it the memory-intensive way. + # XXX: we can probably chunk this to avoid the memory hit. + data = fp.read(count * dtype.itemsize) + array = numpy.fromstring(data, dtype=dtype, count=count) + + if fortran_order: + array.shape = shape[::-1] + array = array.transpose() + else: + array.shape = shape + + return array + diff --git a/tests/test_format.py b/tests/test_format.py new file mode 100644 index 000000000..064754977 --- /dev/null +++ b/tests/test_format.py @@ -0,0 +1,458 @@ +r''' Test the .npy file format. + +Set up: + + >>> import numpy as np + >>> from cStringIO import StringIO + >>> from numpy.lib import format + >>> + >>> scalars = [ + ... np.uint8, + ... np.int8, + ... np.uint16, + ... np.int16, + ... np.uint32, + ... np.int32, + ... np.uint64, + ... np.int64, + ... np.float32, + ... np.float64, + ... np.complex64, + ... np.complex128, + ... object, + ... ] + >>> + >>> basic_arrays = [] + >>> + >>> for scalar in scalars: + ... for endian in '<>': + ... dtype = np.dtype(scalar).newbyteorder(endian) + ... basic = np.arange(15).astype(dtype) + ... basic_arrays.extend([ + ... np.array([], dtype=dtype), + ... np.array(10, dtype=dtype), + ... basic, + ... basic.reshape((3,5)), + ... basic.reshape((3,5)).T, + ... basic.reshape((3,5))[::-1,::2], + ... ]) + ... + >>> + >>> Pdescr = [ + ... ('x', 'i4', (2,)), + ... ('y', 'f8', (2, 2)), + ... ('z', 'u1')] + >>> + >>> + >>> PbufferT = [ + ... ([3,2], [[6.,4.],[6.,4.]], 8), + ... ([4,3], [[7.,5.],[7.,5.]], 9), + ... ] + >>> + >>> + >>> Ndescr = [ + ... ('x', 'i4', (2,)), + ... ('Info', [ + ... ('value', 'c16'), + ... ('y2', 'f8'), + ... ('Info2', [ + ... ('name', 'S2'), + ... ('value', 'c16', (2,)), + ... ('y3', 'f8', (2,)), + ... ('z3', 'u4', (2,))]), + ... ('name', 'S2'), + ... ('z2', 'b1')]), + ... ('color', 'S2'), + ... ('info', [ + ... ('Name', 'U8'), + ... ('Value', 'c16')]), + ... ('y', 'f8', (2, 2)), + ... ('z', 'u1')] + >>> + >>> + >>> NbufferT = [ + ... 
([3,2], (6j, 6., ('nn', [6j,4j], [6.,4.], [1,2]), 'NN', True), 'cc', ('NN', 6j), [[6.,4.],[6.,4.]], 8), + ... ([4,3], (7j, 7., ('oo', [7j,5j], [7.,5.], [2,1]), 'OO', False), 'dd', ('OO', 7j), [[7.,5.],[7.,5.]], 9), + ... ] + >>> + >>> + >>> record_arrays = [ + ... np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')), + ... np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')), + ... np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')), + ... np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')), + ... ] + +Test the magic string writing. + + >>> format.magic(1, 0) + '\x93NUMPY\x01\x00' + >>> format.magic(0, 0) + '\x93NUMPY\x00\x00' + >>> format.magic(255, 255) + '\x93NUMPY\xff\xff' + >>> format.magic(2, 5) + '\x93NUMPY\x02\x05' + +Test the magic string reading. + + >>> format.read_magic(StringIO(format.magic(1, 0))) + (1, 0) + >>> format.read_magic(StringIO(format.magic(0, 0))) + (0, 0) + >>> format.read_magic(StringIO(format.magic(255, 255))) + (255, 255) + >>> format.read_magic(StringIO(format.magic(2, 5))) + (2, 5) + +Test the header writing. + + >>> for arr in basic_arrays + record_arrays: + ... f = StringIO() + ... format.write_array_header_1_0(f, arr) + ... print repr(f.getvalue()) + ... 
+ "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|u1', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|u1', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|i1', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|i1', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<u2', 'fortran_order': True, 'shape': (5, 3)} \n" + 
"F\x00{'descr': '<u2', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>u2', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>u2', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<i2', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<i2', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>i2', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>i2', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<u4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<u4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 5)} \n" + 
"F\x00{'descr': '>u4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>u4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<i4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<i4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>i4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>i4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<u8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<u8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>u8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>u8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (15,)} \n" + 
"F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<i8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<i8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>i8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>i8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<f4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<f4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>f4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>f4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<f8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': ()} \n" + 
"F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>f8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>f8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<c8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<c8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>c8', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>c8', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '<c16', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '<c16', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '>c16', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '>c16', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (0,)} 
\n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|O4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 3)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (0,)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': ()} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (15,)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 5)} \n" + "F\x00{'descr': '|O4', 'fortran_order': True, 'shape': (5, 3)} \n" + "F\x00{'descr': '|O4', 'fortran_order': False, 'shape': (3, 3)} \n" + "v\x00{'descr': [('x', '<i4', (2,)), ('y', '<f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n" + "\x16\x02{'descr': [('x', '<i4', (2,)),\n ('Info',\n [('value', '<c16'),\n ('y2', '<f8'),\n ('Info2',\n [('name', '|S2'),\n ('value', '<c16', (2,)),\n ('y3', '<f8', (2,)),\n ('z3', '<u4', (2,))]),\n ('name', '|S2'),\n ('z2', '|b1')]),\n ('color', '|S2'),\n ('info', [('Name', '<U8'), ('Value', '<c16')]),\n ('y', '<f8', (2, 2)),\n ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n" + "v\x00{'descr': [('x', '>i4', (2,)), ('y', '>f8', (2, 2)), ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n" + "\x16\x02{'descr': [('x', '>i4', (2,)),\n ('Info',\n [('value', '>c16'),\n ('y2', '>f8'),\n ('Info2',\n [('name', '|S2'),\n ('value', '>c16', (2,)),\n ('y3', '>f8', (2,)),\n ('z3', '>u4', (2,))]),\n ('name', '|S2'),\n ('z2', '|b1')]),\n ('color', '|S2'),\n ('info', [('Name', '>U8'), ('Value', '>c16')]),\n ('y', '>f8', (2, 2)),\n ('z', '|u1')],\n 'fortran_order': False,\n 'shape': (2,)} \n" +''' + + +from cStringIO import StringIO + +from nose.tools import raises + +import numpy as np +from numpy.testing import assert_array_equal + +from numpy.lib import format + + +# Generate some basic 
arrays to test with. +scalars = [ + np.uint8, + np.int8, + np.uint16, + np.int16, + np.uint32, + np.int32, + np.uint64, + np.int64, + np.float32, + np.float64, + np.complex64, + np.complex128, + object, +] +basic_arrays = [] +for scalar in scalars: + for endian in '<>': + dtype = np.dtype(scalar).newbyteorder(endian) + basic = np.arange(15).astype(dtype) + basic_arrays.extend([ + # Empty + np.array([], dtype=dtype), + # Rank-0 + np.array(10, dtype=dtype), + # 1-D + basic, + # 2-D C-contiguous + basic.reshape((3,5)), + # 2-D F-contiguous + basic.reshape((3,5)).T, + # 2-D non-contiguous + basic.reshape((3,5))[::-1,::2], + ]) + +# More complicated record arrays. +# This is the structure of the table used for plain objects: +# +# +-+-+-+ +# |x|y|z| +# +-+-+-+ + +# Structure of a plain array description: +Pdescr = [ + ('x', 'i4', (2,)), + ('y', 'f8', (2, 2)), + ('z', 'u1')] + +# A plain list of tuples with values for testing: +PbufferT = [ + # x y z + ([3,2], [[6.,4.],[6.,4.]], 8), + ([4,3], [[7.,5.],[7.,5.]], 9), + ] + + +# This is the structure of the table used for nested objects (DON'T PANIC!): +# +# +-+---------------------------------+-----+----------+-+-+ +# |x|Info |color|info |y|z| +# | +-----+--+----------------+----+--+ +----+-----+ | | +# | |value|y2|Info2 |name|z2| |Name|Value| | | +# | | | +----+-----+--+--+ | | | | | | | +# | | | |name|value|y3|z3| | | | | | | | +# +-+-----+--+----+-----+--+--+----+--+-----+----+-----+-+-+ +# + +# The corresponding nested array description: +Ndescr = [ + ('x', 'i4', (2,)), + ('Info', [ + ('value', 'c16'), + ('y2', 'f8'), + ('Info2', [ + ('name', 'S2'), + ('value', 'c16', (2,)), + ('y3', 'f8', (2,)), + ('z3', 'u4', (2,))]), + ('name', 'S2'), + ('z2', 'b1')]), + ('color', 'S2'), + ('info', [ + ('Name', 'U8'), + ('Value', 'c16')]), + ('y', 'f8', (2, 2)), + ('z', 'u1')] + +NbufferT = [ + # x Info color info y z + # value y2 Info2 name z2 Name Value + # name value y3 z3 + ([3,2], (6j, 6., ('nn', [6j,4j], [6.,4.], [1,2]), 'NN', 
True), 'cc', ('NN', 6j), [[6.,4.],[6.,4.]], 8), + ([4,3], (7j, 7., ('oo', [7j,5j], [7.,5.], [2,1]), 'OO', False), 'dd', ('OO', 7j), [[7.,5.],[7.,5.]], 9), + ] + +record_arrays = [ + np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('<')), + np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('<')), + np.array(PbufferT, dtype=np.dtype(Pdescr).newbyteorder('>')), + np.array(NbufferT, dtype=np.dtype(Ndescr).newbyteorder('>')), +] + +def roundtrip(arr): + f = StringIO() + format.write_array(f, arr) + f2 = StringIO(f.getvalue()) + arr2 = format.read_array(f2) + return arr2 + + +def test_roundtrip(): + for arr in basic_arrays + record_arrays: + print repr(arr) + arr2 = roundtrip(arr) + yield assert_array_equal, arr, arr2 + +def test_write_version_1_0(): + f = StringIO() + arr = np.arange(1) + # These should pass. + format.write_array(f, arr, version=(1, 0)) + format.write_array(f, arr) + + # These should all fail. + bad_versions = [ + (1, 1), + (0, 0), + (0, 1), + (2, 0), + (2, 2), + (255, 255), + ] + for version in bad_versions: + try: + format.write_array(f, arr, version=version) + except ValueError: + pass + else: + raise AssertionError("we should have raised a ValueError for the bad version %r" % (version,)) + + +bad_version_magic = [ + '\x93NUMPY\x01\x01', + '\x93NUMPY\x00\x00', + '\x93NUMPY\x00\x01', + '\x93NUMPY\x02\x00', + '\x93NUMPY\x02\x02', + '\x93NUMPY\xff\xff', +] +malformed_magic = [ + '\x92NUMPY\x01\x00', + '\x00NUMPY\x01\x00', + '\x93numpy\x01\x00', + '\x93MATLB\x01\x00', + '\x93NUMPY\x01', + '\x93NUMPY', + '', +] + +def test_read_magic_bad_magic(): + for magic in malformed_magic: + f = StringIO(magic) + yield raises(ValueError)(format.read_magic), f + +def test_read_version_1_0_bad_magic(): + for magic in bad_version_magic + malformed_magic: + f = StringIO(magic) + yield raises(ValueError)(format.read_array), f + + @@ -1,3 +1,4 @@ +import compiler import os import sys import inspect @@ -10,7 +11,7 @@ __all__ = ['issubclass_', 'get_numpy_include', 
'issubsctype', 'issubdtype', 'deprecate', 'deprecate_with_doc', 'get_numarray_include', 'get_include', 'info', 'source', 'who', - 'byte_bounds', 'may_share_memory'] + 'byte_bounds', 'may_share_memory', 'safe_eval'] def issubclass_(arg1, arg2): try: @@ -466,3 +467,113 @@ def source(object, output=sys.stdout): print >> output, inspect.getsource(object) except: print >> output, "Not available for this object." + +#----------------------------------------------------------------------------- + +# The following SafeEval class and company are adapted from Michael Spencer's +# ASPN Python Cookbook recipe: +# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/364469 +# Accordingly it is mostly Copyright 2006 by Michael Spencer. +# The recipe, like most of the other ASPN Python Cookbook recipes was made +# available under the Python license. +# http://www.python.org/license + +# It has been modified to: +# * handle unary -/+ +# * support True/False/None +# * raise SyntaxError instead of a custom exception. 
+ +class SafeEval(object): + + def visit(self, node, **kw): + cls = node.__class__ + meth = getattr(self,'visit'+cls.__name__,self.default) + return meth(node, **kw) + + def default(self, node, **kw): + raise SyntaxError("Unsupported source construct: %s" % node.__class__) + + def visitExpression(self, node, **kw): + for child in node.getChildNodes(): + return self.visit(child, **kw) + + def visitConst(self, node, **kw): + return node.value + + def visitDict(self, node,**kw): + return dict([(self.visit(k),self.visit(v)) for k,v in node.items]) + + def visitTuple(self, node, **kw): + return tuple([self.visit(i) for i in node.nodes]) + + def visitList(self, node, **kw): + return [self.visit(i) for i in node.nodes] + + def visitUnaryAdd(self, node, **kw): + return +self.visit(node.getChildNodes()[0]) + + def visitUnarySub(self, node, **kw): + return -self.visit(node.getChildNodes()[0]) + + def visitName(self, node, **kw): + if node.name == 'False': + return False + elif node.name == 'True': + return True + elif node.name == 'None': + return None + else: + raise SyntaxError("Unknown name: %s" % node.name) + +def safe_eval(source): + """ Evaluate a string containing a Python literal expression without + allowing the execution of arbitrary non-literal code. + + Parameters + ---------- + source : str + + Returns + ------- + obj : object + + Raises + ------ + SyntaxError if the code is invalid Python expression syntax or if it + contains non-literal code. + + Examples + -------- + >>> from numpy.lib.utils import safe_eval + >>> safe_eval('1') + 1 + >>> safe_eval('[1, 2, 3]') + [1, 2, 3] + >>> safe_eval('{"foo": ("bar", 10.0)}') + {'foo': ('bar', 10.0)} + >>> safe_eval('import os') + Traceback (most recent call last): + ... + SyntaxError: invalid syntax + >>> safe_eval('open("/home/user/.ssh/id_dsa").read()') + Traceback (most recent call last): + ... 
+ SyntaxError: Unsupported source construct: compiler.ast.CallFunc + >>> safe_eval('dict') + Traceback (most recent call last): + ... + SyntaxError: Unknown name: dict + """ + walker = SafeEval() + try: + ast = compiler.parse(source, "eval") + except SyntaxError, err: + raise + try: + return walker.visit(ast) + except SyntaxError, err: + raise + +#----------------------------------------------------------------------------- + + |