""" Define a simple format for saving numpy arrays to disk with the full information about them. WARNING: Due to limitations in the interpretation of structured dtypes, dtypes with fields with empty names will have the names replaced by 'f0', 'f1', etc. Such arrays will not round-trip through the format entirely accurately. The data is intact; only the field names will differ. We are working on a fix for this. This fix will not require a change in the file format. The arrays with such structures can still be saved and restored, and the correct dtype may be restored by using the `loadedarray.view(correct_dtype)` method. Format Version 1.0 ------------------ The first 6 bytes are a magic string: exactly "\\x93NUMPY". The next 1 byte is an unsigned byte: the major version number of the file format, e.g. \\x01. The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. \\x00. Note: the version of the file format is not tied to the version of the numpy package. The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN. The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes. The dictionary contains three keys: "descr" : dtype.descr An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype. "fortran_order" : bool Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency. "shape" : tuple of int The shape of the array. For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order. This is for convenience only. A writer SHOULD implement this if possible. A reader MUST NOT depend on this. Following the header comes the array data. If the dtype contains Python objects (i.e. dtype.hasobject is True), then the data is a Python pickle of the array. Otherwise the data is the contiguous (either C- or Fortran-, depending on fortran_order) bytes of the array. Consumers can figure out the number of bytes by multiplying the number of elements given by the shape (noting that shape=() means there is 1 element) by dtype.itemsize. """ import cPickle import pprint import struct import numpy from numpy.lib.utils import safe_eval MAGIC_PREFIX = '\x93NUMPY' MAGIC_LEN = len(MAGIC_PREFIX) + 2 def magic(major, minor): """ Return the magic string for the given file format version. Parameters ---------- major : int in [0, 255] minor : int in [0, 255] Returns ------- magic : str Raises ------ ValueError if the version cannot be formatted. """ if major < 0 or major > 255: raise ValueError("major version must be 0 <= major < 256") if minor < 0 or minor > 255: raise ValueError("minor version must be 0 <= minor < 256") return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor)) def read_magic(fp): """ Read the magic string to get the version of the file format. Parameters ---------- fp : filelike object Returns ------- major : int minor : int """ magic_str = fp.read(MAGIC_LEN) if len(magic_str) != MAGIC_LEN: raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str)) if magic_str[:-2] != MAGIC_PREFIX: raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2])) major, minor = map(ord, magic_str[-2:]) return major, minor def dtype_to_descr(dtype): """ Get a serializable descriptor from the dtype. The .descr attribute of a dtype object cannot be round-tripped through the dtype() constructor. Simple types, like dtype('float32'), have a descr which looks like a record array with one field with '' as a name. The dtype() constructor interprets this as a request to give a default name. Instead, we construct descriptor that can be passed to dtype(). """ if dtype.names is not None: # This is a record array. The .descr is fine. # XXX: parts of the record array with an empty name, like padding bytes, # still get fiddled with. This needs to be fixed in the C implementation # of dtype(). return dtype.descr else: return dtype.str def header_data_from_array_1_0(array): """ Get the dictionary of header metadata from a numpy.ndarray. Parameters ---------- array : numpy.ndarray Returns ------- d : dict This has the appropriate entries for writing its string representation to the header of the file. """ d = {} d['shape'] = array.shape if array.flags.c_contiguous: d['fortran_order'] = False elif array.flags.f_contiguous: d['fortran_order'] = True else: # Totally non-contiguous data. We will have to make it C-contiguous # before writing. Note that we need to test for C_CONTIGUOUS first # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS. d['fortran_order'] = False d['descr'] = dtype_to_descr(array.dtype) return d def write_array_header_1_0(fp, d): """ Write the header for an array using the 1.0 format. Parameters ---------- fp : filelike object d : dict This has the appropriate entries for writing its string representation to the header of the file. """ header = pprint.pformat(d) # Pad the header with spaces and a final newline such that the magic string, # the header-length short and the header are aligned on a 16-byte boundary. # Hopefully, some system, possibly memory-mapping, can take advantage of # our premature optimization. current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline topad = 16 - (current_header_len % 16) header = '%s%s\n' % (header, ' '*topad) if len(header) >= (256*256): raise ValueError("header does not fit inside %s bytes" % (256*256)) header_len_str = struct.pack('