author:    Eric Wieser <wieser.eric@gmail.com>  2019-08-19 19:16:44 -0500
committer: Eric Wieser <wieser.eric@gmail.com>  2019-08-19 19:16:44 -0500
commit:    0f5e376d3eb6118b783cdd3ecd27722c2d1934ba (patch)
tree:      c44850b579cbd27993c45dda1a7922e2d109b24f /numpy/lib/format.py
parent:    483f565d85dadc899f94710531fba8355d554d59 (diff)
parent:    98bdde643af6443d68a8c6233807b75bd3f0ed80 (diff)
download:  numpy-0f5e376d3eb6118b783cdd3ecd27722c2d1934ba.tar.gz
Merge remote-tracking branch 'upstream/master' into fix-if-fields
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--  numpy/lib/format.py  196
1 file changed, 110 insertions(+), 86 deletions(-)
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 10945e5e8..3bf818812 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -146,10 +146,17 @@
 The description of the fourth element of the header therefore has become:
 "The next 4 bytes form a little-endian unsigned int: the length of the header
 data HEADER_LEN."
 
+Format Version 3.0
+------------------
+
+This version replaces the ASCII string (which in practice was latin1) with
+a utf8-encoded string, so supports structured types with any unicode field
+names.
+
 Notes
 -----
 The ``.npy`` format, including motivation for creating it and a comparison of
 alternatives, is described in the `"npy-format" NEP
 <https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have
 evolved with time and this document is more current.
@@ -162,9 +169,8 @@ import io
 import warnings
 from numpy.lib.utils import safe_eval
 from numpy.compat import (
-    asbytes, asstr, isfileobj, long, os_fspath
+    isfileobj, long, os_fspath, pickle
     )
-from numpy.core.numeric import pickle
 
 
 MAGIC_PREFIX = b'\x93NUMPY'
@@ -174,10 +180,16 @@ BUFFER_SIZE = 2**18  # size of buffer for reading npz files in bytes
 
 # difference between version 1.0 and 2.0 is a 4 byte (I) header length
 # instead of 2 bytes (H) allowing storage of large structured arrays
 
+_header_size_info = {
+    (1, 0): ('<H', 'latin1'),
+    (2, 0): ('<I', 'latin1'),
+    (3, 0): ('<I', 'utf8'),
+}
+
 def _check_version(version):
-    if version not in [(1, 0), (2, 0), None]:
-        msg = "we only support format version (1,0) and (2, 0), not %s"
+    if version not in [(1, 0), (2, 0), (3, 0), None]:
+        msg = "we only support format version (1,0), (2,0), and (3,0), not %s"
         raise ValueError(msg % (version,))
 
 def magic(major, minor):
@@ -262,15 +274,19 @@ def dtype_to_descr(dtype):
 def descr_to_dtype(descr):
     '''
     descr may be stored as dtype.descr, which is a list of
-    (name, format, [shape]) tuples. Offsets are not explicitly saved, rather
-    empty fields with name,format == '', '|Vn' are added as padding.
+    (name, format, [shape]) tuples where format may be a str or a tuple.
+    Offsets are not explicitly saved, rather empty fields with
+    name, format == '', '|Vn' are added as padding.
 
     This function reverses the process, eliminating the empty padding fields.
     '''
-    if isinstance(descr, (str, dict)):
+    if isinstance(descr, str):
         # No padding removal needed
         return numpy.dtype(descr)
-
+    elif isinstance(descr, tuple):
+        # subtype, will always have a shape descr[1]
+        dt = descr_to_dtype(descr[0])
+        return numpy.dtype((dt, descr[1]))
     fields = []
     offset = 0
     for field in descr:
@@ -323,6 +339,56 @@ def header_data_from_array_1_0(array):
     d['descr'] = dtype_to_descr(array.dtype)
     return d
 
+
+def _wrap_header(header, version):
+    """
+    Takes a stringified header, and attaches the prefix and padding to it
+    """
+    import struct
+    assert version is not None
+    fmt, encoding = _header_size_info[version]
+    if not isinstance(header, bytes):  # always true on python 3
+        header = header.encode(encoding)
+    hlen = len(header) + 1
+    padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
+    try:
+        header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
+    except struct.error:
+        msg = "Header length {} too big for version={}".format(hlen, version)
+        raise ValueError(msg)
+
+    # Pad the header with spaces and a final newline such that the magic
+    # string, the header-length short and the header are aligned on a
+    # ARRAY_ALIGN byte boundary.  This supports memory mapping of dtypes
+    # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
+    # offset must be page-aligned (i.e. the beginning of the file).
+    return header_prefix + header + b' '*padlen + b'\n'
+
+
+def _wrap_header_guess_version(header):
+    """
+    Like `_wrap_header`, but chooses an appropriate version given the contents
+    """
+    try:
+        return _wrap_header(header, (1, 0))
+    except ValueError:
+        pass
+
+    try:
+        ret = _wrap_header(header, (2, 0))
+    except UnicodeEncodeError:
+        pass
+    else:
+        warnings.warn("Stored array in format 2.0. It can only be"
+                      "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+        return ret
+
+    header = _wrap_header(header, (3, 0))
+    warnings.warn("Stored array in format 3.0. It can only be "
+                  "read by NumPy >= 1.17", UserWarning, stacklevel=2)
+    return header
+
+
 def _write_array_header(fp, d, version=None):
     """ Write the header for an array and returns the version used
@@ -336,48 +402,19 @@ def _write_array_header(fp, d, version=None):
         None means use oldest that works
         explicit version will raise a ValueError if the format does not
         allow saving this data.  Default: None
-    Returns
-    -------
-    version : tuple of int
-        the file version which needs to be used to store the data
     """
-    import struct
     header = ["{"]
     for key, value in sorted(d.items()):
         # Need to use repr here, since we eval these when reading
         header.append("'%s': %s, " % (key, repr(value)))
     header.append("}")
     header = "".join(header)
-    header = asbytes(_filter_header(header))
-
-    hlen = len(header) + 1 # 1 for newline
-    padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
-    padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)
-
-    # Which version(s) we write depends on the total header size; v1 has a max of 65535
-    if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
-        version = (1, 0)
-        header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
-        topad = padlen_v1
-    elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
-        version = (2, 0)
-        header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
-        topad = padlen_v2
+    header = _filter_header(header)
+    if version is None:
+        header = _wrap_header_guess_version(header)
     else:
-        msg = "Header length %s too big for version=%s"
-        msg %= (hlen, version)
-        raise ValueError(msg)
-
-    # Pad the header with spaces and a final newline such that the magic
-    # string, the header-length short and the header are aligned on a
-    # ARRAY_ALIGN byte boundary.  This supports memory mapping of dtypes
-    # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
-    # offset must be page-aligned (i.e. the beginning of the file).
-    header = header + b' '*topad + b'\n'
-
-    fp.write(header_prefix)
+        header = _wrap_header(header, version)
     fp.write(header)
-    return version
 
 def write_array_header_1_0(fp, d):
     """ Write the header for an array using the 1.0 format.
@@ -480,7 +517,7 @@ def _filter_header(s):
 
     Parameters
    ----------
-    s : byte string
+    s : string
       Npy file header.
 
     Returns
@@ -498,7 +535,7 @@ def _filter_header(s):
     tokens = []
     last_token_was_number = False
     # adding newline as python 2.7.5 workaround
-    string = asstr(s) + "\n"
+    string = s + "\n"
     for token in tokenize.generate_tokens(StringIO(string).readline):
         token_type = token[0]
         token_string = token[1]
@@ -520,16 +557,15 @@
     # Read an unsigned, little-endian short int which has the length of the
     # header.
     import struct
-    if version == (1, 0):
-        hlength_type = '<H'
-    elif version == (2, 0):
-        hlength_type = '<I'
-    else:
-        raise ValueError("Invalid version %r" % version)
+    hinfo = _header_size_info.get(version)
+    if hinfo is None:
+        raise ValueError("Invalid version {!r}".format(version))
+    hlength_type, encoding = hinfo
 
     hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
     header_length = struct.unpack(hlength_type, hlength_str)[0]
     header = _read_bytes(fp, header_length, "array header")
+    header = header.decode(encoding)
 
     # The header is a pretty-printed string representation of a literal
     # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
@@ -541,29 +577,29 @@
     try:
         d = safe_eval(header)
     except SyntaxError as e:
-        msg = "Cannot parse header: %r\nException: %r"
-        raise ValueError(msg % (header, e))
+        msg = "Cannot parse header: {!r}\nException: {!r}"
+        raise ValueError(msg.format(header, e))
     if not isinstance(d, dict):
-        msg = "Header is not a dictionary: %r"
-        raise ValueError(msg % d)
+        msg = "Header is not a dictionary: {!r}"
+        raise ValueError(msg.format(d))
     keys = sorted(d.keys())
     if keys != ['descr', 'fortran_order', 'shape']:
-        msg = "Header does not contain the correct keys: %r"
-        raise ValueError(msg % (keys,))
+        msg = "Header does not contain the correct keys: {!r}"
+        raise ValueError(msg.format(keys))
 
     # Sanity-check the values.
     if (not isinstance(d['shape'], tuple) or
             not numpy.all([isinstance(x, (int, long)) for x in d['shape']])):
-        msg = "shape is not valid: %r"
-        raise ValueError(msg % (d['shape'],))
+        msg = "shape is not valid: {!r}"
+        raise ValueError(msg.format(d['shape']))
     if not isinstance(d['fortran_order'], bool):
-        msg = "fortran_order is not a valid bool: %r"
-        raise ValueError(msg % (d['fortran_order'],))
+        msg = "fortran_order is not a valid bool: {!r}"
+        raise ValueError(msg.format(d['fortran_order']))
     try:
         dtype = descr_to_dtype(d['descr'])
     except TypeError as e:
-        msg = "descr is not a valid dtype descriptor: %r"
-        raise ValueError(msg % (d['descr'],))
+        msg = "descr is not a valid dtype descriptor: {!r}"
+        raise ValueError(msg.format(d['descr']))
 
     return d['shape'], d['fortran_order'], dtype
@@ -604,12 +640,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
 
     """
     _check_version(version)
-    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
-                                   version)
-    # this warning can be removed when 1.9 has aged enough
-    if version != (2, 0) and used_ver == (2, 0):
-        warnings.warn("Stored array in format 2.0. It can only be"
-                      "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+    _write_array_header(fp, header_data_from_array_1_0(array), version)
 
     if array.itemsize == 0:
         buffersize = 0
@@ -619,14 +650,13 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
 
     if array.dtype.hasobject:
         # We contain Python objects so we cannot write out the data
-        # directly.  Instead, we will pickle it out with version 2 of the
-        # pickle protocol.
+        # directly.  Instead, we will pickle it out
         if not allow_pickle:
             raise ValueError("Object arrays cannot be saved when "
                              "allow_pickle=False")
         if pickle_kwargs is None:
             pickle_kwargs = {}
-        pickle.dump(array, fp, protocol=2, **pickle_kwargs)
+        pickle.dump(array, fp, protocol=3, **pickle_kwargs)
     elif array.flags.f_contiguous and not array.flags.c_contiguous:
         if isfileobj(fp):
             array.T.tofile(fp)
@@ -645,7 +675,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
                 fp.write(chunk.tobytes('C'))
 
 
-def read_array(fp, allow_pickle=True, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None):
     """
     Read an array from an NPY file.
 
@@ -655,7 +685,11 @@
         If this is not a real file object, then this may take extra memory
         and time.
     allow_pickle : bool, optional
-        Whether to allow reading pickled data. Default: True
+        Whether to allow writing pickled data. Default: False
+
+        .. versionchanged:: 1.16.3
+            Made default False in response to CVE-2019-6446.
+
     pickle_kwargs : dict
         Additional keyword arguments to pass to pickle.load. These are only
         useful when loading object arrays saved on Python 2 when using
@@ -806,20 +840,12 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
             shape=shape,
         )
         # If we got here, then it should be safe to create the file.
-        fp = open(os_fspath(filename), mode+'b')
-        try:
-            used_ver = _write_array_header(fp, d, version)
-            # this warning can be removed when 1.9 has aged enough
-            if version != (2, 0) and used_ver == (2, 0):
-                warnings.warn("Stored array in format 2.0. It can only be"
-                              "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+        with open(os_fspath(filename), mode+'b') as fp:
+            _write_array_header(fp, d, version)
             offset = fp.tell()
-        finally:
-            fp.close()
     else:
         # Read the header of the file first.
-        fp = open(os_fspath(filename), 'rb')
-        try:
+        with open(os_fspath(filename), 'rb') as fp:
            version = read_magic(fp)
            _check_version(version)
@@ -828,8 +854,6 @@
                 msg = "Array can't be memory-mapped: Python objects in dtype."
                 raise ValueError(msg)
             offset = fp.tell()
-        finally:
-            fp.close()
 
     if fortran_order:
         order = 'F'
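As a quick illustration of what the merged header changes enable, the following is a minimal sketch (not part of the commit, assumes NumPy >= 1.17, file name chosen arbitrarily): a structured dtype whose field name cannot be encoded as latin1 makes the header writer fall through `_wrap_header_guess_version` to the utf8 (3, 0) format, which `read_magic` then reports; reading it back needs no pickling because the array contains no Python objects.

```python
# Sketch only: demonstrates the format 3.0 fallback added in this merge.
# Assumes NumPy >= 1.17; 'unicode_fields.npy' is an arbitrary example path.
import numpy as np
from numpy.lib import format as npy_format

# A structured dtype with a field name that latin1 cannot encode.
arr = np.zeros(3, dtype=[('温度', '<f8')])

# np.save -> _write_array_header -> _wrap_header_guess_version, which falls
# back to the utf8-encoded (3, 0) header for this dtype (and warns about it).
np.save('unicode_fields.npy', arr)

with open('unicode_fields.npy', 'rb') as fp:
    version = npy_format.read_magic(fp)   # expected: (3, 0)

# No allow_pickle needed on load: the array holds no Python objects.
roundtripped = np.load('unicode_fields.npy')
assert roundtripped.dtype == arr.dtype
print(version, roundtripped.dtype)
```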