diff options
author | Eric Wieser <wieser.eric@gmail.com> | 2019-04-16 01:32:35 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-04-16 01:32:35 -0700 |
commit | 9af2340580bcbacc06b1079df3e9b8abf90b7657 (patch) | |
tree | dd8041d48e8cd9b3cc5ddcdab9e0ba851a0b4a9a /numpy/lib/format.py | |
parent | 389bd44e32b0eace0d024b126931a0a00d14cffe (diff) | |
parent | cc94f360febdef0e6c4183c50555ba82e60ccff6 (diff) | |
download | numpy-9af2340580bcbacc06b1079df3e9b8abf90b7657.tar.gz |
Merge branch 'master' into poly1d-fixes-fixes-fixes-fixes
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 158 |
1 files changed, 105 insertions, 53 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 633aee675..4da1022ca 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -1,5 +1,10 @@ """ -Define a simple format for saving numpy arrays to disk with the full +Binary serialization + +NPY format +========== + +A simple format for saving numpy arrays to disk with the full information about them. The ``.npy`` format is the standard binary file format in NumPy for @@ -100,9 +105,9 @@ the header data HEADER_LEN. The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline (``\\n``) and padded with -spaces (``\\x20``) to make the total length of -``magic string + 4 + HEADER_LEN`` be evenly divisible by 16 for alignment -purposes. +spaces (``\\x20``) to make the total of +``len(magic string) + 2 + len(length) + HEADER_LEN`` be evenly divisible +by 64 for alignment purposes. The dictionary contains three keys: @@ -143,8 +148,10 @@ data HEADER_LEN." Notes ----- -The ``.npy`` format, including reasons for creating it and a comparison of -alternatives, is described fully in the "npy-format" NEP. +The ``.npy`` format, including motivation for creating it and a comparison of +alternatives, is described in the `"npy-format" NEP +<https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have +evolved with time and this document is more current. """ from __future__ import division, absolute_import, print_function @@ -154,15 +161,14 @@ import sys import io import warnings from numpy.lib.utils import safe_eval -from numpy.compat import asbytes, asstr, isfileobj, long, basestring +from numpy.compat import ( + asbytes, asstr, isfileobj, long, os_fspath, pickle + ) -if sys.version_info[0] >= 3: - import pickle -else: - import cPickle as pickle -MAGIC_PREFIX = asbytes('\x93NUMPY') +MAGIC_PREFIX = b'\x93NUMPY' MAGIC_LEN = len(MAGIC_PREFIX) + 2 +ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096 BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes # difference between version 1.0 and 2.0 is a 4 byte (I) header length @@ -252,6 +258,43 @@ def dtype_to_descr(dtype): else: return dtype.str +def descr_to_dtype(descr): + ''' + descr may be stored as dtype.descr, which is a list of + (name, format, [shape]) tuples. Offsets are not explicitly saved, rather + empty fields with name,format == '', '|Vn' are added as padding. + + This function reverses the process, eliminating the empty padding fields. + ''' + if isinstance(descr, (str, dict)): + # No padding removal needed + return numpy.dtype(descr) + + fields = [] + offset = 0 + for field in descr: + if len(field) == 2: + name, descr_str = field + dt = descr_to_dtype(descr_str) + else: + name, descr_str, shape = field + dt = numpy.dtype((descr_to_dtype(descr_str), shape)) + + # Ignore padding bytes, which will be void bytes with '' as name + # Once support for blank names is removed, only "if name == ''" needed) + is_pad = (name == '' and dt.type is numpy.void and dt.names is None) + if not is_pad: + fields.append((name, dt, offset)) + + offset += dt.itemsize + + names, formats, offsets = zip(*fields) + # names may be (title, names) tuples + nametups = (n if isinstance(n, tuple) else (None, n) for n in names) + titles, names = zip(*nametups) + return numpy.dtype({'names': names, 'formats': formats, 'titles': titles, + 'offsets': offsets, 'itemsize': offset}) + def header_data_from_array_1_0(array): """ Get the dictionary of header metadata from a numpy.ndarray. @@ -304,27 +347,33 @@ def _write_array_header(fp, d, version=None): header.append("'%s': %s, " % (key, repr(value))) header.append("}") header = "".join(header) - # Pad the header with spaces and a final newline such that the magic - # string, the header-length short and the header are aligned on a - # 16-byte boundary. Hopefully, some system, possibly memory-mapping, - # can take advantage of our premature optimization. - current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline - topad = 16 - (current_header_len % 16) - header = header + ' '*topad + '\n' header = asbytes(_filter_header(header)) - hlen = len(header) - if hlen < 256*256 and version in (None, (1, 0)): + hlen = len(header) + 1 # 1 for newline + padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN) + padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN) + + # Which version(s) we write depends on the total header size; v1 has a max of 65535 + if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)): version = (1, 0) - header_prefix = magic(1, 0) + struct.pack('<H', hlen) - elif hlen < 2**32 and version in (None, (2, 0)): + header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1) + topad = padlen_v1 + elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)): version = (2, 0) - header_prefix = magic(2, 0) + struct.pack('<I', hlen) + header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2) + topad = padlen_v2 else: msg = "Header length %s too big for version=%s" msg %= (hlen, version) raise ValueError(msg) + # Pad the header with spaces and a final newline such that the magic + # string, the header-length short and the header are aligned on a + # ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes + # aligned up to ARRAY_ALIGN on systems like Linux where mmap() + # offset must be page-aligned (i.e. the beginning of the file). + header = header + b' '*topad + b'\n' + fp.write(header_prefix) fp.write(header) return version @@ -447,7 +496,9 @@ def _filter_header(s): tokens = [] last_token_was_number = False - for token in tokenize.generate_tokens(StringIO(asstr(s)).read): + # adding newline as python 2.7.5 workaround + string = asstr(s) + "\n" + for token in tokenize.generate_tokens(StringIO(string).readline): token_type = token[0] token_string = token[1] if (last_token_was_number and @@ -457,7 +508,8 @@ def _filter_header(s): else: tokens.append(token) last_token_was_number = (token_type == tokenize.NUMBER) - return tokenize.untokenize(tokens) + # removing newline (see above) as python 2.7.5 workaround + return tokenize.untokenize(tokens)[:-1] def _read_array_header(fp, version): @@ -468,18 +520,18 @@ def _read_array_header(fp, version): # header. import struct if version == (1, 0): - hlength_str = _read_bytes(fp, 2, "array header length") - header_length = struct.unpack('<H', hlength_str)[0] - header = _read_bytes(fp, header_length, "array header") + hlength_type = '<H' elif version == (2, 0): - hlength_str = _read_bytes(fp, 4, "array header length") - header_length = struct.unpack('<I', hlength_str)[0] - header = _read_bytes(fp, header_length, "array header") + hlength_type = '<I' else: - raise ValueError("Invalid version %r" % version) + raise ValueError("Invalid version {!r}".format(version)) + + hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length") + header_length = struct.unpack(hlength_type, hlength_str)[0] + header = _read_bytes(fp, header_length, "array header") # The header is a pretty-printed string representation of a literal - # Python dictionary with trailing newlines padded to a 16-byte + # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte # boundary. The keys are strings. # "shape" : tuple of int # "fortran_order" : bool @@ -488,29 +540,29 @@ def _read_array_header(fp, version): try: d = safe_eval(header) except SyntaxError as e: - msg = "Cannot parse header: %r\nException: %r" - raise ValueError(msg % (header, e)) + msg = "Cannot parse header: {!r}\nException: {!r}" + raise ValueError(msg.format(header, e)) if not isinstance(d, dict): - msg = "Header is not a dictionary: %r" - raise ValueError(msg % d) + msg = "Header is not a dictionary: {!r}" + raise ValueError(msg.format(d)) keys = sorted(d.keys()) if keys != ['descr', 'fortran_order', 'shape']: - msg = "Header does not contain the correct keys: %r" - raise ValueError(msg % (keys,)) + msg = "Header does not contain the correct keys: {!r}" + raise ValueError(msg.format(keys)) # Sanity-check the values. if (not isinstance(d['shape'], tuple) or not numpy.all([isinstance(x, (int, long)) for x in d['shape']])): - msg = "shape is not valid: %r" - raise ValueError(msg % (d['shape'],)) + msg = "shape is not valid: {!r}" + raise ValueError(msg.format(d['shape'])) if not isinstance(d['fortran_order'], bool): - msg = "fortran_order is not a valid bool: %r" - raise ValueError(msg % (d['fortran_order'],)) + msg = "fortran_order is not a valid bool: {!r}" + raise ValueError(msg.format(d['fortran_order'])) try: - dtype = numpy.dtype(d['descr']) + dtype = descr_to_dtype(d['descr']) except TypeError as e: - msg = "descr is not a valid dtype descriptor: %r" - raise ValueError(msg % (d['descr'],)) + msg = "descr is not a valid dtype descriptor: {!r}" + raise ValueError(msg.format(d['descr'])) return d['shape'], d['fortran_order'], dtype @@ -692,7 +744,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, Parameters ---------- - filename : str + filename : str or path-like The name of the file on disk. This may *not* be a file-like object. mode : str, optional @@ -733,9 +785,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, memmap """ - if not isinstance(filename, basestring): - raise ValueError("Filename must be a string. Memmap cannot use" - " existing file handles.") + if isfileobj(filename): + raise ValueError("Filename must be a string or a path-like object." + " Memmap cannot use existing file handles.") if 'w' in mode: # We are creating the file, not reading it. @@ -753,7 +805,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, shape=shape, ) # If we got here, then it should be safe to create the file. - fp = open(filename, mode+'b') + fp = open(os_fspath(filename), mode+'b') try: used_ver = _write_array_header(fp, d, version) # this warning can be removed when 1.9 has aged enough @@ -765,7 +817,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None, fp.close() else: # Read the header of the file first. - fp = open(filename, 'rb') + fp = open(os_fspath(filename), 'rb') try: version = read_magic(fp) _check_version(version) |