""" Define a simple format for saving numpy arrays to disk with the full information about them. WARNING: THE FORMAT IS CURRENTLY UNSTABLE. DO NOT STORE CRITICAL DATA WITH IT. While this code is in an SVN branch, the format may change without notice, without backwards compatibility, and without changing the format's version number. When the code moves into the trunk the format will be stabilized, the version number will increment as changes occur, and backwards compatibility with older versions will be maintained. Format Version 1.0 ------------------ The first 6 bytes are a magic string: exactly "\\x93NUMPY". The next 1 byte is an unsigned byte: the major version number of the file format, e.g. \\x01. The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. \\x00. Note: the version of the file format is not tied to the version of the numpy package. The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN. The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline ('\\n') and padded with spaces ('\\x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes. The dictionary contains three keys: "descr" : dtype.descr An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype. "fortran_order" : bool Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency. "shape" : tuple of int The shape of the array. For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order. Following the header comes the array data. If the dtype contains Python objects (i.e. dtype.hasobject is True), then the data is a Python pickle of the array. Otherwise the data is the contiguous (either C- or Fortran-, depending on fortran_order) bytes of the array. Consumers can figure out the number of bytes by multiplying the number of elements given by the shape (noting that shape=() means there is 1 element) by dtype.itemsize. """ import cPickle import pprint import struct import numpy from numpy.lib.utils import safe_eval MAGIC_PREFIX = '\x93NUMPY' MAGIC_LEN = len(MAGIC_PREFIX) + 2 def magic(major, minor): """ Return the magic string for the given file format version. Parameters ---------- major : int in [0, 255] minor : int in [0, 255] Returns ------- magic : str Raises ------ ValueError if the version cannot be formatted. """ if major < 0 or major > 255: raise ValueError("major version must be 0 <= major < 256") if minor < 0 or minor > 255: raise ValueError("minor version must be 0 <= minor < 256") return '%s%s%s' % (MAGIC_PREFIX, chr(major), chr(minor)) def read_magic(fp): """ Read the magic string to get the version of the file format. Parameters ---------- fp : filelike object Returns ------- major : int minor : int """ magic_str = fp.read(MAGIC_LEN) if len(magic_str) != MAGIC_LEN: raise ValueError("could not read %d characters for the magic string; got %r" % (MAGIC_LEN, magic_str)) if magic_str[:-2] != MAGIC_PREFIX: raise ValueError("the magic string is not correct; expected %r, got %r" % (MAGIC_PREFIX, magic_str[:-2])) major, minor = map(ord, magic_str[-2:]) return major, minor def dtype_to_descr(dtype): """ Get a serializable descriptor from the dtype. The .descr attribute of a dtype object cannot be round-tripped through the dtype() constructor. Simple types, like dtype('float32'), have a descr which looks like a record array with one field with '' as a name. The dtype() constructor interprets this as a request to give a default name. Instead, we construct descriptor that can be passed to dtype(). """ if dtype.names is not None: # This is a record array. The .descr is fine. # XXX: parts of the record array with an empty name, like padding bytes, # still get fiddled with. This needs to be fixed in the C implementation # of dtype(). return dtype.descr else: return dtype.str def header_data_from_array_1_0(array): """ Get the dictionary of header metadata from a numpy.ndarray. Parameters ---------- array : numpy.ndarray Returns ------- d : dict This has the appropriate entries for writing its string representation to the header of the file. """ d = {} d['shape'] = array.shape if array.flags.c_contiguous: d['fortran_order'] = False elif array.flags.f_contiguous: d['fortran_order'] = True else: # Totally non-contiguous data. We will have to make it C-contiguous # before writing. Note that we need to test for C_CONTIGUOUS first # because a 1-D array is both C_CONTIGUOUS and F_CONTIGUOUS. d['fortran_order'] = False d['descr'] = dtype_to_descr(array.dtype) return d def write_array_header_1_0(fp, d): """ Write the header for an array using the 1.0 format. Parameters ---------- fp : filelike object d : dict This has the appropriate entries for writing its string representation to the header of the file. """ header = pprint.pformat(d) # Pad the header with spaces and a final newline such that the magic string, # the header-length short and the header are aligned on a 16-byte boundary. # Hopefully, some system, possibly memory-mapping, can take advantage of # our premature optimization. current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline topad = 16 - (current_header_len % 16) header = '%s%s\n' % (header, ' '*topad) if len(header) >= (256*256): raise ValueError("header does not fit inside %s bytes" % (256*256)) header_len_str = struct.pack('