diff options
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 412 |
1 files changed, 412 insertions, 0 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py new file mode 100644 index 000000000..bb58c5c61 --- /dev/null +++ b/numpy/lib/format.py @@ -0,0 +1,412 @@ +""" Define a simple format for saving numpy arrays to disk with the full +information about them. + +WARNING: THE FORMAT IS CURRENTLY UNSTABLE. DO NOT STORE CRITICAL DATA WITH IT. + While this code is in an SVN branch, the format may change without + notice, without backwards compatibility, and without changing the + format's version number. When the code moves into the trunk the format + will be stabilized, the version number will increment as changes occur, + and backwards compatibility with older versions will be maintained. + +Format Version 1.0 +------------------ + +The first 6 bytes are a magic string: exactly "\\x93NUMPY". + +The next 1 byte is an unsigned byte: the major version number of the file +format, e.g. \\x01. + +The next 1 byte is an unsigned byte: the minor version number of the file +format, e.g. \\x00. Note: the version of the file format is not tied to the +version of the numpy package. + +The next 2 bytes form a little-endian unsigned short int: the length of the +header data HEADER_LEN. + +The next HEADER_LEN bytes form the header data describing the array's format. It +is an ASCII string which contains a Python literal expression of a dictionary. +It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make +the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 +for alignment purposes. + +The dictionary contains three keys: + + "descr" : dtype.descr + An object that can be passed as an argument to the numpy.dtype() + constructor to create the array's dtype. + "fortran_order" : bool + Whether the array data is Fortran-contiguous or not. Since + Fortran-contiguous arrays are a common form of non-C-contiguity, we + allow them to be written directly to disk for efficiency. + "shape" : tuple of int + The shape of the array. + +For repeatability and readability, this dictionary is formatted using +pprint.pformat() so the keys are in alphabetic order. + +Following the header comes the array data. If the dtype contains Python objects +(i.e. dtype.hasobject is True), then the data is a Python pickle of the array. +Otherwise the data is the contiguous (either C- or Fortran-, depending on +fortran_order) bytes of the array. Consumers can figure out the number of bytes +by multiplying the number of elements given by the shape (noting that shape=() +means there is 1 element) by dtype.itemsize. +""" + +import cPickle +import pprint +import struct + +import numpy +from numpy.lib.utils import safe_eval + + +MAGIC_PREFIX = '\x93NUMPY' +MAGIC_LEN = len(MAGIC_PREFIX) + 2 + +def magic(major, minor): + """ Return the magic string for the given file format version. + + Parameters + ---------- + major : int in [0, 255] + minor : int in [0, 255] + + Returns + ------- + magic : str + + Raises + ------ + ValueError if the version cannot be formatted. + """ + if major < 0 or major > 255: + raise ValueError("major version must be 0 <= major < 256") + if minor < 0 or minor > 255: + raise ValueError("minor version must be 0 <= minor < 256") + return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor)) + +def read_magic(fp): + """ Read the magic string to get the version of the file format. + + Parameters + ---------- + fp : filelike object + + Returns + ------- + major : int + minor : int + """ + magic_str = fp.read(MAGIC_LEN) + if len(magic_str) != MAGIC_LEN: + raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str)) + if magic_str[:-2] != MAGIC_PREFIX: + raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2])) + major, minor = map(ord, magic_str[-2:]) + return major, minor + +def dtype_to_descr(dtype): + """ Get a serializable descriptor from the dtype. + + The .descr attribute of a dtype object cannot be round-tripped through the + dtype() constructor. Simple types, like dtype('float32'), have a descr which + looks like a record array with one field with '' as a name. The dtype() + constructor interprets this as a request to give a default name. Instead, we + construct descriptor that can be passed to dtype(). + """ + if dtype.names is not None: + # This is a record array. The .descr is fine. + # XXX: parts of the record array with an empty name, like padding bytes, + # still get fiddled with. This needs to be fixed in the C implementation + # of dtype(). + return dtype.descr + else: + return dtype.str + +def header_data_from_array_1_0(array): + """ Get the dictionary of header metadata from a numpy.ndarray. + + Parameters + ---------- + array : numpy.ndarray + + Returns + ------- + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ + d = {} + d['shape'] = array.shape + if array.flags.c_contiguous: + d['fortran_order'] = False + elif array.flags.f_contiguous: + d['fortran_order'] = True + else: + # Totally non-contiguous data. We will have to make it C-contiguous + # before writing. Note that we need to test for C_CONTIGUOUS first + # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS. + d['fortran_order'] = False + + d['descr'] = dtype_to_descr(array.dtype) + return d + +def write_array_header_1_0(fp, d): + """ Write the header for an array using the 1.0 format. + + Parameters + ---------- + fp : filelike object + d : dict + This has the appropriate entries for writing its string representation + to the header of the file. + """ + header = pprint.pformat(d) + # Pad the header with spaces and a final newline such that the magic string, + # the header-length short and the header are aligned on a 16-byte boundary. + # Hopefully, some system, possibly memory-mapping, can take advantage of + # our premature optimization. + current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline + topad = 16 - (current_header_len % 16) + header = '%s%s\n' % (header, ' '*topad) + if len(header) >= (256*256): + raise ValueError("header does not fit inside %s bytes" % (256*256)) + header_len_str = struct.pack('<H', len(header)) + fp.write(header_len_str) + fp.write(header) + +def read_array_header_1_0(fp): + """ Read an array header from a filelike object using the 1.0 file format + version. + + This will leave the file object located just after the header. + + Parameters + ---------- + fp : filelike object + + Returns + ------- + shape : tuple of int + The shape of the array. + fortran_order : bool + The array data will be written out directly if it is either C-contiguous + or Fortran-contiguous. Otherwise, it will be made contiguous before + writing it out. + dtype : dtype + + Raises + ------ + ValueError if the data is invalid. + """ + # Read an unsigned, little-endian short int which has the length of the + # header. + hlength_str = fp.read(2) + if len(hlength_str) != 2: + raise ValueError("EOF at %s before reading array header length" % fp.tell()) + header_length = struct.unpack('<H', hlength_str)[0] + header = fp.read(header_length) + if len(header) != header_length: + raise ValueError("EOF at %s before reading array header" % fp.tell()) + + # The header is a pretty-printed string representation of a literal Python + # dictionary with trailing newlines padded to a 16-byte boundary. The keys + # are strings. + # "shape" : tuple of int + # "fortran_order" : bool + # "descr" : dtype.descr + try: + d = safe_eval(header) + except SyntaxError, e: + raise ValueError("Cannot parse header: %r\nException: %r" % (header, e)) + if not isinstance(d, dict): + raise ValueError("Header is not a dictionary: %r" % d) + keys = d.keys() + keys.sort() + if keys != ['descr', 'fortran_order', 'shape']: + raise ValueError("Header does not contain the correct keys: %r" % (keys,)) + + # Sanity-check the values. + if (not isinstance(d['shape'], tuple) or + not numpy.all([isinstance(x, int) for x in d['shape']])): + raise ValueError("shape is not valid: %r" % (d['shape'],)) + if not isinstance(d['fortran_order'], bool): + raise ValueError("fortran_order is not a valid bool: %r" % (d['fortran_order'],)) + try: + dtype = numpy.dtype(d['descr']) + except TypeError, e: + raise ValueError("descr is not a valid dtype descriptor: %r" % (d['descr'],)) + + return d['shape'], d['fortran_order'], dtype + +def write_array(fp, array, version=(1,0)): + """ Write an array to a file, including a header. + + If the array is neither C-contiguous or Fortran-contiguous AND if the + filelike object is not a real file object, then this function will have to + copy data in memory. + + Parameters + ---------- + fp : filelike object + array : numpy.ndarray + version : (int, int), optional + The version number of the format. + + Raises + ------ + ValueError if the array cannot be persisted. + Various other errors from pickling if the array contains Python objects as + part of its dtype. + """ + if version != (1, 0): + raise ValueError("we only support format version (1,0), not %s" % (version,)) + fp.write(magic(*version)) + write_array_header_1_0(fp, header_data_from_array_1_0(array)) + if array.dtype.hasobject: + # We contain Python objects so we cannot write out the data directly. + # Instead, we will pickle it out with version 2 of the pickle protocol. + cPickle.dump(array, fp, protocol=2) + elif array.flags.f_contiguous and not array.flags.c_contiguous: + # Use a suboptimal, possibly memory-intensive, but correct way to handle + # Fortran-contiguous arrays. + fp.write(array.data) + else: + if isinstance(fp, file): + array.tofile(fp) + else: + # XXX: We could probably chunk this using something like + # arrayterator. + fp.write(array.tostring('C')) + +def read_array(fp): + """ Read an array from a file. + + Parameters + ---------- + fp : filelike object + If this is not a real file object, then this may take extra memory and + time. + + Returns + ------- + array : numpy.ndarray + + Raises + ------ + ValueError if the data is invalid. + """ + version = read_magic(fp) + if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + shape, fortran_order, dtype = read_array_header_1_0(fp) + if len(shape) == 0: + count = 1 + else: + count = numpy.multiply.reduce(shape) + + # Now read the actual data. + if dtype.hasobject: + # The array contained Python objects. We need to unpickle the data. + array = cPickle.load(fp) + else: + if isinstance(fp, file): + # We can use the fast fromfile() function. + array = numpy.fromfile(fp, dtype=dtype, count=count) + else: + # This is not a real file. We have to read it the memory-intensive way. + # XXX: we can probably chunk this to avoid the memory hit. + data = fp.read(count * dtype.itemsize) + array = numpy.fromstring(data, dtype=dtype, count=count) + + if fortran_order: + array.shape = shape[::-1] + array = array.transpose() + else: + array.shape = shape + + return array + + +def open_memmap(filename, mode='r+', dtype=None, shape=None, + fortran_order=False, version=(1,0)): + """ Open a .npy file as a memory-mapped array. + + Parameters + ---------- + filename : str + mode : str, optional + The mode to open the file with. In addition to the standard file modes, + 'c' is also accepted to mean "copy on write". + dtype : dtype, optional + shape : tuple of int, optional + fortran_order : bool, optional + If the mode is a "write" mode, then the file will be created using this + dtype, shape, and contiguity. + version : tuple of int (major, minor) + If the mode is a "write" mode, then this is the version of the file + format used to create the file. + + Returns + ------- + marray : numpy.memmap + + Raises + ------ + ValueError if the data or the mode is invalid. + IOError if the file is not found or cannot be opened correctly. + """ + if 'w' in mode: + # We are creating the file, not reading it. + # Check if we ought to create the file. + if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + # Ensure that the given dtype is an authentic dtype object rather than + # just something that can be interpreted as a dtype object. + dtype = numpy.dtype(dtype) + if dtype.hasobject: + raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped") + d = dict( + descr=dtype_to_descr(dtype), + fortran_order=fortran_order, + shape=shape, + ) + # If we got here, then it should be safe to create the file. + fp = open(filename, mode+'b') + try: + fp.write(magic(*version)) + write_array_header_1_0(fp, d) + offset = fp.tell() + finally: + fp.close() + else: + # Read the header of the file first. + fp = open(filename, 'rb') + try: + version = read_magic(fp) + if version != (1, 0): + raise ValueError("only support version (1,0) of file format, not %r" % (version,)) + shape, fortran_order, dtype = read_array_header_1_0(fp) + if dtype.hasobject: + raise ValueError("the dtype includes Python objects; the array cannot be memory-mapped") + offset = fp.tell() + finally: + fp.close() + + if fortran_order: + order = 'F' + else: + order = 'C' + + # We need to change a write-only mode to a read-write mode since we've + # already written data to the file. + if mode == 'w+': + mode = 'r+' + + marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order, + mode=mode, offset=offset) + + return marray + + + |