summaryrefslogtreecommitdiff
path: root/numpy/lib/format.py
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--numpy/lib/format.py158
1 files changed, 105 insertions, 53 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 633aee675..4da1022ca 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -1,5 +1,10 @@
"""
-Define a simple format for saving numpy arrays to disk with the full
+Binary serialization
+
+NPY format
+==========
+
+A simple format for saving numpy arrays to disk with the full
information about them.
The ``.npy`` format is the standard binary file format in NumPy for
@@ -100,9 +105,9 @@ the header data HEADER_LEN.
The next HEADER_LEN bytes form the header data describing the array's
format. It is an ASCII string which contains a Python literal expression
of a dictionary. It is terminated by a newline (``\\n``) and padded with
-spaces (``\\x20``) to make the total length of
-``magic string + 4 + HEADER_LEN`` be evenly divisible by 16 for alignment
-purposes.
+spaces (``\\x20``) to make the total of
+``len(magic string) + 2 + len(length) + HEADER_LEN`` be evenly divisible
+by 64 for alignment purposes.
The dictionary contains three keys:
@@ -143,8 +148,10 @@ data HEADER_LEN."
Notes
-----
-The ``.npy`` format, including reasons for creating it and a comparison of
-alternatives, is described fully in the "npy-format" NEP.
+The ``.npy`` format, including motivation for creating it and a comparison of
+alternatives, is described in the `"npy-format" NEP
+<https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have
+evolved with time and this document is more current.
"""
from __future__ import division, absolute_import, print_function
@@ -154,15 +161,14 @@ import sys
import io
import warnings
from numpy.lib.utils import safe_eval
-from numpy.compat import asbytes, asstr, isfileobj, long, basestring
+from numpy.compat import (
+ asbytes, asstr, isfileobj, long, os_fspath, pickle
+ )
-if sys.version_info[0] >= 3:
- import pickle
-else:
- import cPickle as pickle
-MAGIC_PREFIX = asbytes('\x93NUMPY')
+MAGIC_PREFIX = b'\x93NUMPY'
MAGIC_LEN = len(MAGIC_PREFIX) + 2
+ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096
BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes
# difference between version 1.0 and 2.0 is a 4 byte (I) header length
@@ -252,6 +258,43 @@ def dtype_to_descr(dtype):
else:
return dtype.str
+def descr_to_dtype(descr):
+ '''
+ descr may be stored as dtype.descr, which is a list of
+ (name, format, [shape]) tuples. Offsets are not explicitly saved, rather
+ empty fields with name,format == '', '|Vn' are added as padding.
+
+ This function reverses the process, eliminating the empty padding fields.
+ '''
+ if isinstance(descr, (str, dict)):
+ # No padding removal needed
+ return numpy.dtype(descr)
+
+ fields = []
+ offset = 0
+ for field in descr:
+ if len(field) == 2:
+ name, descr_str = field
+ dt = descr_to_dtype(descr_str)
+ else:
+ name, descr_str, shape = field
+ dt = numpy.dtype((descr_to_dtype(descr_str), shape))
+
+ # Ignore padding bytes, which will be void bytes with '' as name
+ # Once support for blank names is removed, only "if name == ''" needed)
+ is_pad = (name == '' and dt.type is numpy.void and dt.names is None)
+ if not is_pad:
+ fields.append((name, dt, offset))
+
+ offset += dt.itemsize
+
+ names, formats, offsets = zip(*fields)
+ # names may be (title, names) tuples
+ nametups = (n if isinstance(n, tuple) else (None, n) for n in names)
+ titles, names = zip(*nametups)
+ return numpy.dtype({'names': names, 'formats': formats, 'titles': titles,
+ 'offsets': offsets, 'itemsize': offset})
+
def header_data_from_array_1_0(array):
""" Get the dictionary of header metadata from a numpy.ndarray.
@@ -304,27 +347,33 @@ def _write_array_header(fp, d, version=None):
header.append("'%s': %s, " % (key, repr(value)))
header.append("}")
header = "".join(header)
- # Pad the header with spaces and a final newline such that the magic
- # string, the header-length short and the header are aligned on a
- # 16-byte boundary. Hopefully, some system, possibly memory-mapping,
- # can take advantage of our premature optimization.
- current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline
- topad = 16 - (current_header_len % 16)
- header = header + ' '*topad + '\n'
header = asbytes(_filter_header(header))
- hlen = len(header)
- if hlen < 256*256 and version in (None, (1, 0)):
+ hlen = len(header) + 1 # 1 for newline
+ padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
+ padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)
+
+ # Which version(s) we write depends on the total header size; v1 has a max of 65535
+ if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
version = (1, 0)
- header_prefix = magic(1, 0) + struct.pack('<H', hlen)
- elif hlen < 2**32 and version in (None, (2, 0)):
+ header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
+ topad = padlen_v1
+ elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
version = (2, 0)
- header_prefix = magic(2, 0) + struct.pack('<I', hlen)
+ header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
+ topad = padlen_v2
else:
msg = "Header length %s too big for version=%s"
msg %= (hlen, version)
raise ValueError(msg)
+ # Pad the header with spaces and a final newline such that the magic
+ # string, the header-length short and the header are aligned on a
+ # ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes
+ # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
+ # offset must be page-aligned (i.e. the beginning of the file).
+ header = header + b' '*topad + b'\n'
+
fp.write(header_prefix)
fp.write(header)
return version
@@ -447,7 +496,9 @@ def _filter_header(s):
tokens = []
last_token_was_number = False
- for token in tokenize.generate_tokens(StringIO(asstr(s)).read):
+ # adding newline as python 2.7.5 workaround
+ string = asstr(s) + "\n"
+ for token in tokenize.generate_tokens(StringIO(string).readline):
token_type = token[0]
token_string = token[1]
if (last_token_was_number and
@@ -457,7 +508,8 @@ def _filter_header(s):
else:
tokens.append(token)
last_token_was_number = (token_type == tokenize.NUMBER)
- return tokenize.untokenize(tokens)
+ # removing newline (see above) as python 2.7.5 workaround
+ return tokenize.untokenize(tokens)[:-1]
def _read_array_header(fp, version):
@@ -468,18 +520,18 @@ def _read_array_header(fp, version):
# header.
import struct
if version == (1, 0):
- hlength_str = _read_bytes(fp, 2, "array header length")
- header_length = struct.unpack('<H', hlength_str)[0]
- header = _read_bytes(fp, header_length, "array header")
+ hlength_type = '<H'
elif version == (2, 0):
- hlength_str = _read_bytes(fp, 4, "array header length")
- header_length = struct.unpack('<I', hlength_str)[0]
- header = _read_bytes(fp, header_length, "array header")
+ hlength_type = '<I'
else:
- raise ValueError("Invalid version %r" % version)
+ raise ValueError("Invalid version {!r}".format(version))
+
+ hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
+ header_length = struct.unpack(hlength_type, hlength_str)[0]
+ header = _read_bytes(fp, header_length, "array header")
# The header is a pretty-printed string representation of a literal
- # Python dictionary with trailing newlines padded to a 16-byte
+ # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
# boundary. The keys are strings.
# "shape" : tuple of int
# "fortran_order" : bool
@@ -488,29 +540,29 @@ def _read_array_header(fp, version):
try:
d = safe_eval(header)
except SyntaxError as e:
- msg = "Cannot parse header: %r\nException: %r"
- raise ValueError(msg % (header, e))
+ msg = "Cannot parse header: {!r}\nException: {!r}"
+ raise ValueError(msg.format(header, e))
if not isinstance(d, dict):
- msg = "Header is not a dictionary: %r"
- raise ValueError(msg % d)
+ msg = "Header is not a dictionary: {!r}"
+ raise ValueError(msg.format(d))
keys = sorted(d.keys())
if keys != ['descr', 'fortran_order', 'shape']:
- msg = "Header does not contain the correct keys: %r"
- raise ValueError(msg % (keys,))
+ msg = "Header does not contain the correct keys: {!r}"
+ raise ValueError(msg.format(keys))
# Sanity-check the values.
if (not isinstance(d['shape'], tuple) or
not numpy.all([isinstance(x, (int, long)) for x in d['shape']])):
- msg = "shape is not valid: %r"
- raise ValueError(msg % (d['shape'],))
+ msg = "shape is not valid: {!r}"
+ raise ValueError(msg.format(d['shape']))
if not isinstance(d['fortran_order'], bool):
- msg = "fortran_order is not a valid bool: %r"
- raise ValueError(msg % (d['fortran_order'],))
+ msg = "fortran_order is not a valid bool: {!r}"
+ raise ValueError(msg.format(d['fortran_order']))
try:
- dtype = numpy.dtype(d['descr'])
+ dtype = descr_to_dtype(d['descr'])
except TypeError as e:
- msg = "descr is not a valid dtype descriptor: %r"
- raise ValueError(msg % (d['descr'],))
+ msg = "descr is not a valid dtype descriptor: {!r}"
+ raise ValueError(msg.format(d['descr']))
return d['shape'], d['fortran_order'], dtype
@@ -692,7 +744,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
Parameters
----------
- filename : str
+ filename : str or path-like
The name of the file on disk. This may *not* be a file-like
object.
mode : str, optional
@@ -733,9 +785,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
memmap
"""
- if not isinstance(filename, basestring):
- raise ValueError("Filename must be a string. Memmap cannot use"
- " existing file handles.")
+ if isfileobj(filename):
+ raise ValueError("Filename must be a string or a path-like object."
+ " Memmap cannot use existing file handles.")
if 'w' in mode:
# We are creating the file, not reading it.
@@ -753,7 +805,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
shape=shape,
)
# If we got here, then it should be safe to create the file.
- fp = open(filename, mode+'b')
+ fp = open(os_fspath(filename), mode+'b')
try:
used_ver = _write_array_header(fp, d, version)
# this warning can be removed when 1.9 has aged enough
@@ -765,7 +817,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
fp.close()
else:
# Read the header of the file first.
- fp = open(filename, 'rb')
+ fp = open(os_fspath(filename), 'rb')
try:
version = read_magic(fp)
_check_version(version)