path: root/numpy/lib/format.py
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--  numpy/lib/format.py  132
1 file changed, 100 insertions, 32 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 4e6e731c1..ef50fb19d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -41,10 +41,10 @@ Capabilities
- Is straightforward to reverse engineer. Datasets often live longer than
the programs that created them. A competent developer should be
able to create a solution in their preferred programming language to
- read most ``.npy`` files that he has been given without much
+ read most ``.npy`` files that they have been given without much
documentation.
-- Allows memory-mapping of the data. See `open_memmep`.
+- Allows memory-mapping of the data. See `open_memmap`.
- Can be read from a filelike stream object instead of an actual file.
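A minimal sketch of the two capabilities above, assuming an existing file named example.npy (the file name and the io.BytesIO wrapper are illustrative, not part of the patch):

    import io
    from numpy.lib.format import open_memmap, read_array

    # Memory-map the file rather than reading it all into memory.
    mapped = open_memmap("example.npy", mode="r")

    # Read the same data from an in-memory filelike stream object.
    with open("example.npy", "rb") as f:
        arr = read_array(io.BytesIO(f.read()))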
@@ -162,7 +162,6 @@ evolved with time and this document is more current.
"""
import numpy
-import io
import warnings
from numpy.lib.utils import safe_eval
from numpy.compat import (
@@ -173,10 +172,13 @@ from numpy.compat import (
__all__ = []
+EXPECTED_KEYS = {'descr', 'fortran_order', 'shape'}
MAGIC_PREFIX = b'\x93NUMPY'
MAGIC_LEN = len(MAGIC_PREFIX) + 2
ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096
BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes
+# allow growth within the address space of a 64 bit machine along one axis
+GROWTH_AXIS_MAX_DIGITS = 21 # = len(str(8*2**64-1)) hypothetical int1 dtype
# difference between version 1.0 and 2.0 is a 4 byte (I) header length
# instead of 2 bytes (H) allowing storage of large structured arrays
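A quick check of the arithmetic behind GROWTH_AXIS_MAX_DIGITS above (illustrative, not part of the patch): the largest element count a single axis could reach in a 64-bit address space, assuming the hypothetical 1-bit-per-element dtype mentioned in the comment, takes 21 decimal digits.

    # 8 * 2**64 - 1 elements is the most one axis could hold if each element
    # occupied a single bit of a 64-bit address space.
    assert len(str(8 * 2**64 - 1)) == 21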
@@ -186,6 +188,10 @@ _header_size_info = {
(3, 0): ('<I', 'utf8'),
}
+# Python's literal_eval is not actually safe for large inputs, since parsing
+# may become slow or even cause interpreter crashes.
+# This is an arbitrary, low limit which should make it safe in practice.
+_MAX_HEADER_SIZE = 10000
def _check_version(version):
if version not in [(1, 0), (2, 0), (3, 0), None]:
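For legitimate files whose header exceeds this 10000-byte default, for example an array with a very large structured dtype, the reading functions below accept a larger limit explicitly. A usage sketch with an illustrative file name and limit:

    from numpy.lib import format as npy_format

    with open("big_header.npy", "rb") as f:
        arr = npy_format.read_array(f, max_header_size=100_000)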
@@ -291,7 +297,7 @@ def descr_to_dtype(descr):
Parameters
----------
descr : object
- The object retreived by dtype.descr. Can be passed to
+ The object retrieved by dtype.descr. Can be passed to
`numpy.dtype()` in order to replicate the input dtype.
Returns
@@ -370,15 +376,14 @@ def _wrap_header(header, version):
import struct
assert version is not None
fmt, encoding = _header_size_info[version]
- if not isinstance(header, bytes): # always true on python 3
- header = header.encode(encoding)
+ header = header.encode(encoding)
hlen = len(header) + 1
padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
try:
header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
except struct.error:
msg = "Header length {} too big for version={}".format(hlen, version)
- raise ValueError(msg)
+ raise ValueError(msg) from None
# Pad the header with spaces and a final newline such that the magic
# string, the header-length short and the header are aligned on a
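A worked example of the padding arithmetic in _wrap_header, assuming format version (1, 0), where the magic string plus version bytes take MAGIC_LEN = 8 bytes and the '<H' length field takes 2; the 109-byte header length is an arbitrary illustration:

    import struct

    MAGIC_LEN, ARRAY_ALIGN = 8, 64
    hlen = 109   # header bytes, including the final newline
    fmt = '<H'   # version (1, 0) header-length field
    padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
    # 8 + 2 + 109 + 9 == 128, a multiple of 64, so the array data starts aligned.
    assert padlen == 9
    assert (MAGIC_LEN + struct.calcsize(fmt) + hlen + padlen) % ARRAY_ALIGN == 0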
@@ -421,10 +426,10 @@ def _write_array_header(fp, d, version=None):
d : dict
This has the appropriate entries for writing its string representation
to the header of the file.
- version: tuple or None
- None means use oldest that works
- explicit version will raise a ValueError if the format does not
- allow saving this data. Default: None
+ version : tuple or None
+ None means use oldest that works. Providing an explicit version will
+ raise a ValueError if the format does not allow saving this data.
+ Default: None
"""
header = ["{"]
for key, value in sorted(d.items()):
@@ -432,7 +437,15 @@ def _write_array_header(fp, d, version=None):
header.append("'%s': %s, " % (key, repr(value)))
header.append("}")
header = "".join(header)
- header = _filter_header(header)
+
+ # Add some spare space so that the array header can be modified in-place
+ # when changing the array size, e.g. when growing it by appending data at
+ # the end.
+ shape = d['shape']
+ header += " " * ((GROWTH_AXIS_MAX_DIGITS - len(repr(
+ shape[-1 if d['fortran_order'] else 0]
+ ))) if len(shape) > 0 else 0)
+
if version is None:
header = _wrap_header_guess_version(header)
else:
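A worked example of the spare-space computation above (values illustrative): for shape=(10, 100) with fortran_order=False the growth axis is the first one, repr(10) is 2 characters, so 21 - 2 = 19 spaces are appended, leaving room to rewrite the shape in place if the array is later grown along that axis.

    GROWTH_AXIS_MAX_DIGITS = 21
    d = {'shape': (10, 100), 'fortran_order': False}

    growth_axis_len = len(repr(d['shape'][-1 if d['fortran_order'] else 0]))  # len('10') == 2
    assert GROWTH_AXIS_MAX_DIGITS - growth_axis_len == 19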
@@ -467,7 +480,7 @@ def write_array_header_2_0(fp, d):
"""
_write_array_header(fp, d, (2, 0))
-def read_array_header_1_0(fp):
+def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 1.0 file format
version.
@@ -489,6 +502,10 @@ def read_array_header_1_0(fp):
contiguous before writing it out.
dtype : dtype
The dtype of the file's data.
+ max_header_size : int, optional
+ Maximum allowed size of the header. Large headers may not be safe
+ to load securely and thus require explicitly passing a larger value.
+ See :py:func:`ast.literal_eval()` for details.
Raises
------
@@ -496,9 +513,10 @@ def read_array_header_1_0(fp):
If the data is invalid.
"""
- return _read_array_header(fp, version=(1, 0))
+ return _read_array_header(
+ fp, version=(1, 0), max_header_size=max_header_size)
-def read_array_header_2_0(fp):
+def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
"""
Read an array header from a filelike object using the 2.0 file format
version.
@@ -511,6 +529,10 @@ def read_array_header_2_0(fp):
----------
fp : filelike object
A file object or something with a `.read()` method like a file.
+ max_header_size : int, optional
+ Maximum allowed size of the header. Large headers may not be safe
+ to load securely and thus require explicitly passing a larger value.
+ See :py:func:`ast.literal_eval()` for details.
Returns
-------
@@ -529,7 +551,8 @@ def read_array_header_2_0(fp):
If the data is invalid.
"""
- return _read_array_header(fp, version=(2, 0))
+ return _read_array_header(
+ fp, version=(2, 0), max_header_size=max_header_size)
def _filter_header(s):
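A usage sketch for the header readers above (file name illustrative): read_magic consumes the magic string and version bytes, after which the header can be parsed with the optional size limit:

    from numpy.lib.format import read_magic, read_array_header_1_0

    with open("example.npy", "rb") as f:
        major, minor = read_magic(f)   # e.g. (1, 0)
        shape, fortran_order, dtype = read_array_header_1_0(
            f, max_header_size=10000)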
@@ -567,7 +590,7 @@ def _filter_header(s):
return tokenize.untokenize(tokens)
-def _read_array_header(fp, version):
+def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
"""
see read_array_header_1_0
"""
@@ -583,6 +606,14 @@ def _read_array_header(fp, version):
header_length = struct.unpack(hlength_type, hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
header = header.decode(encoding)
+ if len(header) > max_header_size:
+ raise ValueError(
+ f"Header info length ({len(header)}) is large and may not be safe "
+ "to load securely.\n"
+ "To allow loading, adjust `max_header_size` or fully trust "
+ "the `.npy` file using `allow_pickle=True`.\n"
+ "For safety against large resource use or crashes, sandboxing "
+ "may be necessary.")
# The header is a pretty-printed string representation of a literal
# Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
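For reference, a well-formed header of the kind described in the comment above looks like the following once decoded (values illustrative); this literal, plus space padding and a trailing newline, is what gets parsed below:

    import ast

    example_header = "{'descr': '<f8', 'fortran_order': False, 'shape': (10, 100), }"
    d = ast.literal_eval(example_header)   # safe_eval wraps ast.literal_eval
    assert sorted(d) == ['descr', 'fortran_order', 'shape']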
@@ -590,23 +621,41 @@ def _read_array_header(fp, version):
# "shape" : tuple of int
# "fortran_order" : bool
# "descr" : dtype.descr
- header = _filter_header(header)
+ # Versions (2, 0) and (1, 0) could have been created by a Python 2
+ # implementation before header filtering was implemented.
+ #
+ # For performance reasons, we try without _filter_header first though
try:
d = safe_eval(header)
except SyntaxError as e:
- msg = "Cannot parse header: {!r}\nException: {!r}"
- raise ValueError(msg.format(header, e))
+ if version <= (2, 0):
+ header = _filter_header(header)
+ try:
+ d = safe_eval(header)
+ except SyntaxError as e2:
+ msg = "Cannot parse header: {!r}"
+ raise ValueError(msg.format(header)) from e2
+ else:
+ warnings.warn(
+ "Reading `.npy` or `.npz` file required additional "
+ "header parsing as it was created on Python 2. Save the "
+ "file again to speed up loading and avoid this warning.",
+ UserWarning, stacklevel=4)
+ else:
+ msg = "Cannot parse header: {!r}"
+ raise ValueError(msg.format(header)) from e
if not isinstance(d, dict):
msg = "Header is not a dictionary: {!r}"
raise ValueError(msg.format(d))
- keys = sorted(d.keys())
- if keys != ['descr', 'fortran_order', 'shape']:
+
+ if EXPECTED_KEYS != d.keys():
+ keys = sorted(d.keys())
msg = "Header does not contain the correct keys: {!r}"
raise ValueError(msg.format(keys))
# Sanity-check the values.
if (not isinstance(d['shape'], tuple) or
- not numpy.all([isinstance(x, int) for x in d['shape']])):
+ not all(isinstance(x, int) for x in d['shape'])):
msg = "shape is not valid: {!r}"
raise ValueError(msg.format(d['shape']))
if not isinstance(d['fortran_order'], bool):
@@ -614,9 +663,9 @@ def _read_array_header(fp, version):
raise ValueError(msg.format(d['fortran_order']))
try:
dtype = descr_to_dtype(d['descr'])
- except TypeError:
+ except TypeError as e:
msg = "descr is not a valid dtype descriptor: {!r}"
- raise ValueError(msg.format(d['descr']))
+ raise ValueError(msg.format(d['descr'])) from e
return d['shape'], d['fortran_order'], dtype
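An illustrative example of the Python 2 fallback handled above (the exact header text is hypothetical): a legacy file may spell its shape with long-integer literals such as (10L,), which safe_eval rejects with a SyntaxError; _filter_header strips the 'L' suffixes so the second attempt succeeds, and the UserWarning suggests re-saving the file.

    legacy_header = "{'descr': '<i8', 'fortran_order': False, 'shape': (10L,), }"
    # After _filter_header the header is valid Python 3 syntax, roughly:
    #   {'descr': '<i8', 'fortran_order': False, 'shape': (10,), }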
@@ -692,7 +741,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
fp.write(chunk.tobytes('C'))
-def read_array(fp, allow_pickle=False, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
+ max_header_size=_MAX_HEADER_SIZE):
"""
Read an array from an NPY file.
@@ -711,6 +761,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
Additional keyword arguments to pass to pickle.load. These are only
useful when loading object arrays saved on Python 2 when using
Python 3.
+ max_header_size : int, optional
+ Maximum allowed size of the header. Large headers may not be safe
+ to load securely and thus require explicitly passing a larger value.
+ See :py:func:`ast.literal_eval()` for details.
+ This option is ignored when `allow_pickle` is passed. In that case
+ the file is by definition trusted and the limit is unnecessary.
Returns
-------
@@ -724,9 +780,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
an object array.
"""
+ if allow_pickle:
+ # Effectively ignore max_header_size, since `allow_pickle` indicates
+ # that the input is fully trusted.
+ max_header_size = 2**64
+
version = read_magic(fp)
_check_version(version)
- shape, fortran_order, dtype = _read_array_header(fp, version)
+ shape, fortran_order, dtype = _read_array_header(
+ fp, version, max_header_size=max_header_size)
if len(shape) == 0:
count = 1
else:
@@ -746,7 +808,7 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
# Friendlier error message
raise UnicodeError("Unpickling a python object failed: %r\n"
"You may need to pass the encoding= option "
- "to numpy.load" % (err,))
+ "to numpy.load" % (err,)) from err
else:
if isfileobj(fp):
# We can use the fast fromfile() function.
@@ -786,7 +848,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
def open_memmap(filename, mode='r+', dtype=None, shape=None,
- fortran_order=False, version=None):
+ fortran_order=False, version=None, *,
+ max_header_size=_MAX_HEADER_SIZE):
"""
Open a .npy file as a memory-mapped array.
@@ -817,6 +880,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
If the mode is a "write" mode, then this is the version of the file
format used to create the file. None means use the oldest
supported version that is able to store the data. Default: None
+ max_header_size : int, optional
+ Maximum allowed size of the header. Large headers may not be safe
+ to load securely and thus require explicitly passing a larger value.
+ See :py:func:`ast.literal_eval()` for details.
Returns
-------
@@ -827,7 +894,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
------
ValueError
If the data or the mode is invalid.
- IOError
+ OSError
If the file is not found or cannot be opened correctly.
See Also
@@ -864,7 +931,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
version = read_magic(fp)
_check_version(version)
- shape, fortran_order, dtype = _read_array_header(fp, version)
+ shape, fortran_order, dtype = _read_array_header(
+ fp, version, max_header_size=max_header_size)
if dtype.hasobject:
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
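A usage sketch for open_memmap with the new keyword (file name and limit illustrative): mapping an existing file whose header is larger than the default limit allows.

    from numpy.lib.format import open_memmap

    mm = open_memmap("big_header.npy", mode="r", max_header_size=100_000)
    print(mm.shape, mm.dtype)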
@@ -905,7 +973,7 @@ def _read_bytes(fp, size, error_template="ran out of data"):
data += r
if len(r) == 0 or len(data) == size:
break
- except io.BlockingIOError:
+ except BlockingIOError:
pass
if len(data) != size:
msg = "EOF: reading %s, expected %d bytes got %d"