path: root/numpy/lib/format.py
author     Eric Wieser <wieser.eric@gmail.com>   2019-08-19 19:16:44 -0500
committer  Eric Wieser <wieser.eric@gmail.com>   2019-08-19 19:16:44 -0500
commit     0f5e376d3eb6118b783cdd3ecd27722c2d1934ba (patch)
tree       c44850b579cbd27993c45dda1a7922e2d109b24f /numpy/lib/format.py
parent     483f565d85dadc899f94710531fba8355d554d59 (diff)
parent     98bdde643af6443d68a8c6233807b75bd3f0ed80 (diff)
download   numpy-0f5e376d3eb6118b783cdd3ecd27722c2d1934ba.tar.gz
Merge remote-tracking branch 'upstream/master' into fix-if-fields
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--  numpy/lib/format.py  196
1 file changed, 110 insertions, 86 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 10945e5e8..3bf818812 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -146,10 +146,17 @@ The description of the fourth element of the header therefore has become:
"The next 4 bytes form a little-endian unsigned int: the length of the header
data HEADER_LEN."
+Format Version 3.0
+------------------
+
+This version replaces the ASCII string (which in practice was latin1) with
+a utf8-encoded string, so supports structured types with any unicode field
+names.
+
Notes
-----
The ``.npy`` format, including motivation for creating it and a comparison of
-alternatives, is described in the `"npy-format" NEP
+alternatives, is described in the `"npy-format" NEP
<https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have
evolved with time and this document is more current.
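
As a rough illustration of the new 3.0 format described above (not part of this
patch; the field name and the in-memory buffer are invented for the example), a
structured dtype whose field name cannot be encoded as latin1 forces a 3.0 header:

    import io
    import numpy as np
    from numpy.lib import format as npformat

    # Field name outside latin1 -> header must be utf8, i.e. format 3.0.
    arr = np.zeros(2, dtype=[('温度', '<f8')])
    buf = io.BytesIO()
    npformat.write_array(buf, arr)       # warns that format 3.0 was used
    buf.seek(0)
    print(npformat.read_magic(buf))      # (3, 0)
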
@@ -162,9 +169,8 @@ import io
import warnings
from numpy.lib.utils import safe_eval
from numpy.compat import (
- asbytes, asstr, isfileobj, long, os_fspath
+ isfileobj, long, os_fspath, pickle
)
-from numpy.core.numeric import pickle
MAGIC_PREFIX = b'\x93NUMPY'
@@ -174,10 +180,16 @@ BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes
# difference between version 1.0 and 2.0 is a 4 byte (I) header length
# instead of 2 bytes (H) allowing storage of large structured arrays
+_header_size_info = {
+ (1, 0): ('<H', 'latin1'),
+ (2, 0): ('<I', 'latin1'),
+ (3, 0): ('<I', 'utf8'),
+}
+
def _check_version(version):
- if version not in [(1, 0), (2, 0), None]:
- msg = "we only support format version (1,0) and (2, 0), not %s"
+ if version not in [(1, 0), (2, 0), (3, 0), None]:
+ msg = "we only support format version (1,0), (2,0), and (3,0), not %s"
raise ValueError(msg % (version,))
def magic(major, minor):
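
For reference, a small sketch of how the version tuples in _header_size_info map
onto the magic string produced by magic() (output bytes shown as comments, assuming
the MAGIC_PREFIX defined above):

    import io
    from numpy.lib.format import magic, read_magic

    print(magic(1, 0))                           # b'\x93NUMPY\x01\x00'
    print(magic(3, 0))                           # b'\x93NUMPY\x03\x00'
    print(read_magic(io.BytesIO(magic(3, 0))))   # (3, 0)
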
@@ -262,15 +274,19 @@ def dtype_to_descr(dtype):
def descr_to_dtype(descr):
'''
descr may be stored as dtype.descr, which is a list of
- (name, format, [shape]) tuples. Offsets are not explicitly saved, rather
- empty fields with name,format == '', '|Vn' are added as padding.
+ (name, format, [shape]) tuples where format may be a str or a tuple.
+ Offsets are not explicitly saved, rather empty fields with
+ name, format == '', '|Vn' are added as padding.
This function reverses the process, eliminating the empty padding fields.
'''
- if isinstance(descr, (str, dict)):
+ if isinstance(descr, str):
# No padding removal needed
return numpy.dtype(descr)
-
+ elif isinstance(descr, tuple):
+ # subtype, will always have a shape descr[1]
+ dt = descr_to_dtype(descr[0])
+ return numpy.dtype((dt, descr[1]))
fields = []
offset = 0
for field in descr:
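
A hedged sketch of the round trip that descr_to_dtype reverses (the field names and
offsets are invented for illustration): explicit offsets show up as '', '|Vn' padding
entries in the descr, and descr_to_dtype strips them back out:

    import numpy as np
    from numpy.lib.format import dtype_to_descr, descr_to_dtype

    dt = np.dtype({'names': ['a', 'b'],
                   'formats': ['<i4', '<f8'],
                   'offsets': [0, 8]})
    descr = dtype_to_descr(dt)            # [('a', '<i4'), ('', '|V4'), ('b', '<f8')]
    print(descr_to_dtype(descr) == dt)    # True: padding field eliminated again
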
@@ -323,6 +339,56 @@ def header_data_from_array_1_0(array):
d['descr'] = dtype_to_descr(array.dtype)
return d
+
+def _wrap_header(header, version):
+ """
+ Takes a stringified header, and attaches the prefix and padding to it
+ """
+ import struct
+ assert version is not None
+ fmt, encoding = _header_size_info[version]
+ if not isinstance(header, bytes): # always true on python 3
+ header = header.encode(encoding)
+ hlen = len(header) + 1
+ padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
+ try:
+ header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
+ except struct.error:
+ msg = "Header length {} too big for version={}".format(hlen, version)
+ raise ValueError(msg)
+
+ # Pad the header with spaces and a final newline such that the magic
+ # string, the header-length short and the header are aligned on a
+ # ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes
+ # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
+ # offset must be page-aligned (i.e. the beginning of the file).
+ return header_prefix + header + b' '*padlen + b'\n'
+
+
+def _wrap_header_guess_version(header):
+ """
+ Like `_wrap_header`, but chooses an appropriate version given the contents
+ """
+ try:
+ return _wrap_header(header, (1, 0))
+ except ValueError:
+ pass
+
+ try:
+ ret = _wrap_header(header, (2, 0))
+ except UnicodeEncodeError:
+ pass
+ else:
+ warnings.warn("Stored array in format 2.0. It can only be"
+ "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+ return ret
+
+ header = _wrap_header(header, (3, 0))
+ warnings.warn("Stored array in format 3.0. It can only be "
+ "read by NumPy >= 1.17", UserWarning, stacklevel=2)
+ return header
+
+
def _write_array_header(fp, d, version=None):
""" Write the header for an array and returns the version used
@@ -336,48 +402,19 @@ def _write_array_header(fp, d, version=None):
None means use oldest that works
explicit version will raise a ValueError if the format does not
allow saving this data. Default: None
- Returns
- -------
- version : tuple of int
- the file version which needs to be used to store the data
"""
- import struct
header = ["{"]
for key, value in sorted(d.items()):
# Need to use repr here, since we eval these when reading
header.append("'%s': %s, " % (key, repr(value)))
header.append("}")
header = "".join(header)
- header = asbytes(_filter_header(header))
-
- hlen = len(header) + 1 # 1 for newline
- padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
- padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)
-
- # Which version(s) we write depends on the total header size; v1 has a max of 65535
- if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
- version = (1, 0)
- header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
- topad = padlen_v1
- elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
- version = (2, 0)
- header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
- topad = padlen_v2
+ header = _filter_header(header)
+ if version is None:
+ header = _wrap_header_guess_version(header)
else:
- msg = "Header length %s too big for version=%s"
- msg %= (hlen, version)
- raise ValueError(msg)
-
- # Pad the header with spaces and a final newline such that the magic
- # string, the header-length short and the header are aligned on a
- # ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes
- # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
- # offset must be page-aligned (i.e. the beginning of the file).
- header = header + b' '*topad + b'\n'
-
- fp.write(header_prefix)
+ header = _wrap_header(header, version)
fp.write(header)
- return version
def write_array_header_1_0(fp, d):
""" Write the header for an array using the 1.0 format.
@@ -480,7 +517,7 @@ def _filter_header(s):
Parameters
----------
- s : byte string
+ s : string
Npy file header.
Returns
@@ -498,7 +535,7 @@ def _filter_header(s):
tokens = []
last_token_was_number = False
# adding newline as python 2.7.5 workaround
- string = asstr(s) + "\n"
+ string = s + "\n"
for token in tokenize.generate_tokens(StringIO(string).readline):
token_type = token[0]
token_string = token[1]
@@ -520,16 +557,15 @@ def _read_array_header(fp, version):
# Read an unsigned, little-endian short int which has the length of the
# header.
import struct
- if version == (1, 0):
- hlength_type = '<H'
- elif version == (2, 0):
- hlength_type = '<I'
- else:
- raise ValueError("Invalid version %r" % version)
+ hinfo = _header_size_info.get(version)
+ if hinfo is None:
+ raise ValueError("Invalid version {!r}".format(version))
+ hlength_type, encoding = hinfo
hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
header_length = struct.unpack(hlength_type, hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
+ header = header.decode(encoding)
# The header is a pretty-printed string representation of a literal
# Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
@@ -541,29 +577,29 @@ def _read_array_header(fp, version):
try:
d = safe_eval(header)
except SyntaxError as e:
- msg = "Cannot parse header: %r\nException: %r"
- raise ValueError(msg % (header, e))
+ msg = "Cannot parse header: {!r}\nException: {!r}"
+ raise ValueError(msg.format(header, e))
if not isinstance(d, dict):
- msg = "Header is not a dictionary: %r"
- raise ValueError(msg % d)
+ msg = "Header is not a dictionary: {!r}"
+ raise ValueError(msg.format(d))
keys = sorted(d.keys())
if keys != ['descr', 'fortran_order', 'shape']:
- msg = "Header does not contain the correct keys: %r"
- raise ValueError(msg % (keys,))
+ msg = "Header does not contain the correct keys: {!r}"
+ raise ValueError(msg.format(keys))
# Sanity-check the values.
if (not isinstance(d['shape'], tuple) or
not numpy.all([isinstance(x, (int, long)) for x in d['shape']])):
- msg = "shape is not valid: %r"
- raise ValueError(msg % (d['shape'],))
+ msg = "shape is not valid: {!r}"
+ raise ValueError(msg.format(d['shape']))
if not isinstance(d['fortran_order'], bool):
- msg = "fortran_order is not a valid bool: %r"
- raise ValueError(msg % (d['fortran_order'],))
+ msg = "fortran_order is not a valid bool: {!r}"
+ raise ValueError(msg.format(d['fortran_order']))
try:
dtype = descr_to_dtype(d['descr'])
except TypeError as e:
- msg = "descr is not a valid dtype descriptor: %r"
- raise ValueError(msg % (d['descr'],))
+ msg = "descr is not a valid dtype descriptor: {!r}"
+ raise ValueError(msg.format(d['descr']))
return d['shape'], d['fortran_order'], dtype
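
A sketch of reading a header back with the private helper whose validation is shown
above (internal API, so the name may change; the buffer is invented for the example):

    import io
    import numpy as np
    from numpy.lib import format as npformat

    buf = io.BytesIO()
    npformat.write_array(buf, np.zeros((2, 3), dtype='<f8'))
    buf.seek(0)
    version = npformat.read_magic(buf)
    shape, fortran_order, dtype = npformat._read_array_header(buf, version)
    print(shape, fortran_order, dtype)        # (2, 3) False float64
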
@@ -604,12 +640,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
"""
_check_version(version)
- used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
- version)
- # this warning can be removed when 1.9 has aged enough
- if version != (2, 0) and used_ver == (2, 0):
- warnings.warn("Stored array in format 2.0. It can only be"
- "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+ _write_array_header(fp, header_data_from_array_1_0(array), version)
if array.itemsize == 0:
buffersize = 0
@@ -619,14 +650,13 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
if array.dtype.hasobject:
# We contain Python objects so we cannot write out the data
- # directly. Instead, we will pickle it out with version 2 of the
- # pickle protocol.
+ # directly. Instead, we will pickle it out
if not allow_pickle:
raise ValueError("Object arrays cannot be saved when "
"allow_pickle=False")
if pickle_kwargs is None:
pickle_kwargs = {}
- pickle.dump(array, fp, protocol=2, **pickle_kwargs)
+ pickle.dump(array, fp, protocol=3, **pickle_kwargs)
elif array.flags.f_contiguous and not array.flags.c_contiguous:
if isfileobj(fp):
array.T.tofile(fp)
@@ -645,7 +675,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
fp.write(chunk.tobytes('C'))
-def read_array(fp, allow_pickle=True, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None):
"""
Read an array from an NPY file.
@@ -655,7 +685,11 @@ def read_array(fp, allow_pickle=True, pickle_kwargs=None):
If this is not a real file object, then this may take extra memory
and time.
allow_pickle : bool, optional
- Whether to allow reading pickled data. Default: True
+ Whether to allow reading pickled data. Default: False
+
+ .. versionchanged:: 1.16.3
+ Made default False in response to CVE-2019-6446.
+
pickle_kwargs : dict
Additional keyword arguments to pass to pickle.load. These are only
useful when loading object arrays saved on Python 2 when using
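
A hedged sketch of the allow_pickle behaviour documented above (the dict payload and
buffer are invented): object arrays are written via pickle, and reading them back now
requires an explicit opt-in:

    import io
    import numpy as np
    from numpy.lib import format as npformat

    buf = io.BytesIO()
    npformat.write_array(buf, np.array([{'a': 1}], dtype=object))
    buf.seek(0)
    try:
        npformat.read_array(buf)                   # default allow_pickle=False
    except ValueError as exc:
        print(exc)                                 # refuses to unpickle
    buf.seek(0)
    print(npformat.read_array(buf, allow_pickle=True))
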
@@ -806,20 +840,12 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
shape=shape,
)
# If we got here, then it should be safe to create the file.
- fp = open(os_fspath(filename), mode+'b')
- try:
- used_ver = _write_array_header(fp, d, version)
- # this warning can be removed when 1.9 has aged enough
- if version != (2, 0) and used_ver == (2, 0):
- warnings.warn("Stored array in format 2.0. It can only be"
- "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+ with open(os_fspath(filename), mode+'b') as fp:
+ _write_array_header(fp, d, version)
offset = fp.tell()
- finally:
- fp.close()
else:
# Read the header of the file first.
- fp = open(os_fspath(filename), 'rb')
- try:
+ with open(os_fspath(filename), 'rb') as fp:
version = read_magic(fp)
_check_version(version)
@@ -828,8 +854,6 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
offset = fp.tell()
- finally:
- fp.close()
if fortran_order:
order = 'F'
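
Finally, a usage sketch for open_memmap, the function touched by the with-statement
change above (the filename is hypothetical):

    import numpy as np
    from numpy.lib.format import open_memmap

    mm = open_memmap('example.npy', mode='w+', dtype='<f8', shape=(4, 4))
    mm[:] = 1.0                     # backed by the file created above
    mm.flush()
    ro = open_memmap('example.npy', mode='r')
    print(ro.shape, ro[0, 0])       # (4, 4) 1.0
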