Merge branch 'master' into ix_-preserve-type

author: Eric Wieser <wieser.eric@gmail.com> 2018-07-31 00:41:28 -0700
committer: GitHub <noreply@github.com> 2018-07-31 00:41:28 -0700
commit: 7f4579279a6a6aa07df664b901afa36ab3fc5ce0 (patch)
tree: 3524c05c661f4948eabf066b46b5ad3aaf6ad617 /numpy/lib/format.py
parent: 24960daf3e326591047eb099af840da6e95d0910 (diff)
parent: 9bb569c4e0e1cf08128179d157bdab10c8706a97 (diff)
download: numpy-7f4579279a6a6aa07df664b901afa36ab3fc5ce0.tar.gz
1 files changed, 45 insertions, 28 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 633aee675..ef5ec57e3 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -1,5 +1,10 @@
 """
-Define a simple format for saving numpy arrays to disk with the full
+Binary serialization
+
+NPY format
+==========
+
+A simple format for saving numpy arrays to disk with the full
 information about them.
 
 The ``.npy`` format is the standard binary file format in NumPy for
@@ -100,9 +105,9 @@ the header data HEADER_LEN.
 The next HEADER_LEN bytes form the header data describing the array's
 format. It is an ASCII string which contains a Python literal expression
 of a dictionary. It is terminated by a newline (``\\n``) and padded with
-spaces (``\\x20``) to make the total length of
-``magic string + 4 + HEADER_LEN`` be evenly divisible by 16 for alignment
-purposes.
+spaces (``\\x20``) to make the total of
+``len(magic string) + 2 + len(length) + HEADER_LEN`` be evenly divisible
+by 64 for alignment purposes.
 
 The dictionary contains three keys:
 
@@ -143,8 +148,10 @@ data HEADER_LEN."
 
 Notes
 -----
-The ``.npy`` format, including reasons for creating it and a comparison of
-alternatives, is described fully in the "npy-format" NEP.
+The ``.npy`` format, including motivation for creating it and a comparison of
+alternatives, is described in the `"npy-format" NEP
+<https://www.numpy.org/neps/nep-0001-npy-format.html>`_; however, details have
+evolved with time and this document is more current.
 
 """
 from __future__ import division, absolute_import, print_function
@@ -161,8 +168,9 @@ if sys.version_info[0] >= 3:
 else:
     import cPickle as pickle
 
-MAGIC_PREFIX = asbytes('\x93NUMPY')
+MAGIC_PREFIX = b'\x93NUMPY'
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
+ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096
 BUFFER_SIZE = 2**18  # size of buffer for reading npz files in bytes
 
 # difference between version 1.0 and 2.0 is a 4 byte (I) header length
@@ -304,27 +312,33 @@ def _write_array_header(fp, d, version=None):
         header.append("'%s': %s, " % (key, repr(value)))
     header.append("}")
     header = "".join(header)
-    # Pad the header with spaces and a final newline such that the magic
-    # string, the header-length short and the header are aligned on a
-    # 16-byte boundary.  Hopefully, some system, possibly memory-mapping,
-    # can take advantage of our premature optimization.
-    current_header_len = MAGIC_LEN + 2 + len(header) + 1  # 1 for the newline
-    topad = 16 - (current_header_len % 16)
-    header = header + ' '*topad + '\n'
     header = asbytes(_filter_header(header))
 
-    hlen = len(header)
-    if hlen < 256*256 and version in (None, (1, 0)):
+    hlen = len(header) + 1 # 1 for newline
+    padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
+    padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)
+
+    # Which version(s) we write depends on the total header size; v1 has a max of 65535
+    if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
         version = (1, 0)
-        header_prefix = magic(1, 0) + struct.pack('<H', hlen)
-    elif hlen < 2**32 and version in (None, (2, 0)):
+        header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
+        topad = padlen_v1
+    elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
         version = (2, 0)
-        header_prefix = magic(2, 0) + struct.pack('<I', hlen)
+        header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
+        topad = padlen_v2
     else:
         msg = "Header length %s too big for version=%s"
         msg %= (hlen, version)
         raise ValueError(msg)
 
+    # Pad the header with spaces and a final newline such that the magic
+    # string, the header-length field and the header are aligned on an
+    # ARRAY_ALIGN-byte boundary.  This supports memory mapping of dtypes
+    # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
+    # offset must be page-aligned (i.e. the beginning of the file).
+    header = header + b' '*topad + b'\n'
+
     fp.write(header_prefix)
     fp.write(header)
     return version
@@ -447,7 +461,9 @@ def _filter_header(s):
 
     tokens = []
     last_token_was_number = False
-    for token in tokenize.generate_tokens(StringIO(asstr(s)).read):
+    # adding newline as python 2.7.5 workaround
+    string = asstr(s) + "\n"
+    for token in tokenize.generate_tokens(StringIO(string).readline):
         token_type = token[0]
         token_string = token[1]
         if (last_token_was_number and
@@ -457,7 +473,8 @@ def _filter_header(s):
         else:
             tokens.append(token)
         last_token_was_number = (token_type == tokenize.NUMBER)
-    return tokenize.untokenize(tokens)
+    # removing newline (see above) as python 2.7.5 workaround
+    return tokenize.untokenize(tokens)[:-1]
 
 
 def _read_array_header(fp, version):
@@ -468,18 +485,18 @@ def _read_array_header(fp, version):
     # header.
     import struct
     if version == (1, 0):
-        hlength_str = _read_bytes(fp, 2, "array header length")
-        header_length = struct.unpack('<H', hlength_str)[0]
-        header = _read_bytes(fp, header_length, "array header")
+        hlength_type = '<H'
     elif version == (2, 0):
-        hlength_str = _read_bytes(fp, 4, "array header length")
-        header_length = struct.unpack('<I', hlength_str)[0]
-        header = _read_bytes(fp, header_length, "array header")
+        hlength_type = '<I'
     else:
         raise ValueError("Invalid version %r" % version)
 
+    hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
+    header_length = struct.unpack(hlength_type, hlength_str)[0]
+    header = _read_bytes(fp, header_length, "array header")
+
     # The header is a pretty-printed string representation of a literal
-    # Python dictionary with trailing newlines padded to a 16-byte
+    # Python dictionary with trailing newlines padded to an ARRAY_ALIGN byte
     # boundary. The keys are strings.
     #   "shape" : tuple of int
     #   "fortran_order" : bool
author	Eric Wieser <wieser.eric@gmail.com>	2018-07-31 00:41:28 -0700
committer	GitHub <noreply@github.com>	2018-07-31 00:41:28 -0700
commit	7f4579279a6a6aa07df664b901afa36ab3fc5ce0 (patch)
tree	3524c05c661f4948eabf066b46b5ad3aaf6ad617 /numpy/lib/format.py
parent	24960daf3e326591047eb099af840da6e95d0910 (diff)
parent	9bb569c4e0e1cf08128179d157bdab10c8706a97 (diff)
download	numpy-7f4579279a6a6aa07df664b901afa36ab3fc5ce0.tar.gz