author    | scoder <stefan_ml@behnel.de> | 2023-05-04 09:29:53 +0200
committer | GitHub <noreply@github.com>  | 2023-05-04 09:29:53 +0200
commit    | 442c8f48d3146ec32c7d5387310e171276cf10ac
tree      | d8911d1a64e384b7955d3fc09a07edd218a9f1ee /numpy/lib/format.py
parent    | 3e4a6cba2da27bbe2a6e12c163238e503c9f6a07
parent    | 9163e933df91b516b6f0c7a9ba8ad1750e642f37
download  | numpy-442c8f48d3146ec32c7d5387310e171276cf10ac.tar.gz
Merge branch 'main' into cython3_noexcept
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r-- | numpy/lib/format.py | 132
1 file changed, 100 insertions(+), 32 deletions(-)
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 4e6e731c1..ef50fb19d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -41,10 +41,10 @@ Capabilities
 - Is straightforward to reverse engineer. Datasets often live longer than
   the programs that created them. A competent developer should be
   able to create a solution in their preferred programming language to
-  read most ``.npy`` files that he has been given without much
+  read most ``.npy`` files that they have been given without much
   documentation.
 
-- Allows memory-mapping of the data. See `open_memmep`.
+- Allows memory-mapping of the data. See `open_memmap`.
 
 - Can be read from a filelike stream object instead of an actual file.
 
@@ -162,7 +162,6 @@ evolved with time and this document is more current.
 
 """
 import numpy
-import io
 import warnings
 from numpy.lib.utils import safe_eval
 from numpy.compat import (
@@ -173,10 +172,13 @@ from numpy.compat import (
 
 __all__ = []
 
+EXPECTED_KEYS = {'descr', 'fortran_order', 'shape'}
 MAGIC_PREFIX = b'\x93NUMPY'
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
 ARRAY_ALIGN = 64 # plausible values are powers of 2 between 16 and 4096
 BUFFER_SIZE = 2**18  # size of buffer for reading npz files in bytes
+# allow growth within the address space of a 64 bit machine along one axis
+GROWTH_AXIS_MAX_DIGITS = 21  # = len(str(8*2**64-1)) hypothetical int1 dtype
 
 # difference between version 1.0 and 2.0 is a 4 byte (I) header length
 # instead of 2 bytes (H) allowing storage of large structured arrays
@@ -186,6 +188,10 @@ _header_size_info = {
     (3, 0): ('<I', 'utf8'),
 }
 
+# Python's literal_eval is not actually safe for large inputs, since parsing
+# may become slow or even cause interpreter crashes.
+# This is an arbitrary, low limit which should make it safe in practice.
+_MAX_HEADER_SIZE = 10000
 
 def _check_version(version):
     if version not in [(1, 0), (2, 0), (3, 0), None]:
@@ -291,7 +297,7 @@ def descr_to_dtype(descr):
     Parameters
     ----------
     descr : object
-        The object retreived by dtype.descr. Can be passed to
+        The object retrieved by dtype.descr. Can be passed to
         `numpy.dtype()` in order to replicate the input dtype.
 
     Returns
     -------
@@ -370,15 +376,14 @@ def _wrap_header(header, version):
     import struct
     assert version is not None
     fmt, encoding = _header_size_info[version]
-    if not isinstance(header, bytes):  # always true on python 3
-        header = header.encode(encoding)
+    header = header.encode(encoding)
    hlen = len(header) + 1
     padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
     try:
         header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
     except struct.error:
         msg = "Header length {} too big for version={}".format(hlen, version)
-        raise ValueError(msg)
+        raise ValueError(msg) from None
 
     # Pad the header with spaces and a final newline such that the magic
     # string, the header-length short and the header are aligned on a
@@ -421,10 +426,10 @@ def _write_array_header(fp, d, version=None):
     d : dict
         This has the appropriate entries for writing its string
         representation to the header of the file.
-    version: tuple or None
-        None means use oldest that works
-        explicit version will raise a ValueError if the format does not
-        allow saving this data. Default: None
+    version : tuple or None
+        None means use oldest that works. Providing an explicit version will
+        raise a ValueError if the format does not allow saving this data.
+        Default: None
     """
     header = ["{"]
     for key, value in sorted(d.items()):
@@ -432,7 +437,15 @@ def _write_array_header(fp, d, version=None):
         header.append("'%s': %s, " % (key, repr(value)))
     header.append("}")
     header = "".join(header)
-    header = _filter_header(header)
+
+    # Add some spare space so that the array header can be modified in-place
+    # when changing the array size, e.g. when growing it by appending data at
+    # the end.
+    shape = d['shape']
+    header += " " * ((GROWTH_AXIS_MAX_DIGITS - len(repr(
+        shape[-1 if d['fortran_order'] else 0]
+    ))) if len(shape) > 0 else 0)
+
     if version is None:
         header = _wrap_header_guess_version(header)
     else:
@@ -467,7 +480,7 @@ def write_array_header_2_0(fp, d):
     """
     _write_array_header(fp, d, (2, 0))
 
-def read_array_header_1_0(fp):
+def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array header from a filelike object using the 1.0 file format
     version.
@@ -489,6 +502,10 @@ def read_array_header_1_0(fp):
         contiguous before writing it out.
     dtype : dtype
         The dtype of the file's data.
+    max_header_size : int, optional
+        Maximum allowed size of the header. Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Raises
     ------
@@ -496,9 +513,10 @@ def read_array_header_1_0(fp):
     ValueError
         If the data is invalid.
 
     """
-    return _read_array_header(fp, version=(1, 0))
+    return _read_array_header(
+        fp, version=(1, 0), max_header_size=max_header_size)
 
-def read_array_header_2_0(fp):
+def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array header from a filelike object using the 2.0 file format
     version.
@@ -511,6 +529,10 @@ def read_array_header_2_0(fp):
     ----------
     fp : filelike object
         A file object or something with a `.read()` method like a file.
+    max_header_size : int, optional
+        Maximum allowed size of the header. Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
@@ -529,7 +551,8 @@ def read_array_header_2_0(fp):
         If the data is invalid.
 
     """
-    return _read_array_header(fp, version=(2, 0))
+    return _read_array_header(
+        fp, version=(2, 0), max_header_size=max_header_size)
 
 def _filter_header(s):
@@ -567,7 +590,7 @@ def _filter_header(s):
     return tokenize.untokenize(tokens)
 
-def _read_array_header(fp, version):
+def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
     """
     see read_array_header_1_0
     """
@@ -583,6 +606,14 @@ def _read_array_header(fp, version):
     header_length = struct.unpack(hlength_type, hlength_str)[0]
     header = _read_bytes(fp, header_length, "array header")
     header = header.decode(encoding)
+    if len(header) > max_header_size:
+        raise ValueError(
+            f"Header info length ({len(header)}) is large and may not be safe "
+            "to load securely.\n"
+            "To allow loading, adjust `max_header_size` or fully trust "
+            "the `.npy` file using `allow_pickle=True`.\n"
+            "For safety against large resource use or crashes, sandboxing "
+            "may be necessary.")
 
     # The header is a pretty-printed string representation of a literal
     # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
@@ -590,23 +621,41 @@ def _read_array_header(fp, version):
     #   "shape" : tuple of int
     #   "fortran_order" : bool
     #   "descr" : dtype.descr
-    header = _filter_header(header)
+    # Versions (2, 0) and (1, 0) could have been created by a Python 2
+    # implementation before header filtering was implemented.
+    #
+    # For performance reasons, we try without _filter_header first though
     try:
         d = safe_eval(header)
     except SyntaxError as e:
-        msg = "Cannot parse header: {!r}\nException: {!r}"
-        raise ValueError(msg.format(header, e))
+        if version <= (2, 0):
+            header = _filter_header(header)
+            try:
+                d = safe_eval(header)
+            except SyntaxError as e2:
+                msg = "Cannot parse header: {!r}"
+                raise ValueError(msg.format(header)) from e2
+            else:
+                warnings.warn(
+                    "Reading `.npy` or `.npz` file required additional "
+                    "header parsing as it was created on Python 2. Save the "
+                    "file again to speed up loading and avoid this warning.",
+                    UserWarning, stacklevel=4)
+        else:
+            msg = "Cannot parse header: {!r}"
+            raise ValueError(msg.format(header)) from e
     if not isinstance(d, dict):
         msg = "Header is not a dictionary: {!r}"
         raise ValueError(msg.format(d))
-    keys = sorted(d.keys())
-    if keys != ['descr', 'fortran_order', 'shape']:
+
+    if EXPECTED_KEYS != d.keys():
+        keys = sorted(d.keys())
         msg = "Header does not contain the correct keys: {!r}"
         raise ValueError(msg.format(keys))
 
     # Sanity-check the values.
     if (not isinstance(d['shape'], tuple) or
-            not numpy.all([isinstance(x, int) for x in d['shape']])):
+            not all(isinstance(x, int) for x in d['shape'])):
         msg = "shape is not valid: {!r}"
         raise ValueError(msg.format(d['shape']))
     if not isinstance(d['fortran_order'], bool):
@@ -614,9 +663,9 @@ def _read_array_header(fp, version):
         raise ValueError(msg.format(d['fortran_order']))
     try:
         dtype = descr_to_dtype(d['descr'])
-    except TypeError:
+    except TypeError as e:
         msg = "descr is not a valid dtype descriptor: {!r}"
-        raise ValueError(msg.format(d['descr']))
+        raise ValueError(msg.format(d['descr'])) from e
 
     return d['shape'], d['fortran_order'], dtype
@@ -692,7 +741,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
             fp.write(chunk.tobytes('C'))
 
-def read_array(fp, allow_pickle=False, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
+               max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array from an NPY file.
 
@@ -711,6 +761,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
         Additional keyword arguments to pass to pickle.load. These are only
         useful when loading object arrays saved on Python 2 when using
         Python 3.
+    max_header_size : int, optional
+        Maximum allowed size of the header. Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
+        This option is ignored when `allow_pickle` is passed. In that case
+        the file is by definition trusted and the limit is unnecessary.
 
     Returns
     -------
@@ -724,9 +780,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
         an object array.
 
     """
+    if allow_pickle:
+        # Effectively ignore max_header_size, since `allow_pickle` indicates
+        # that the input is fully trusted.
+        max_header_size = 2**64
+
     version = read_magic(fp)
     _check_version(version)
-    shape, fortran_order, dtype = _read_array_header(fp, version)
+    shape, fortran_order, dtype = _read_array_header(
+        fp, version, max_header_size=max_header_size)
     if len(shape) == 0:
         count = 1
     else:
@@ -746,7 +808,7 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
                 # Friendlier error message
                 raise UnicodeError("Unpickling a python object failed: %r\n"
                                    "You may need to pass the encoding= option "
-                                   "to numpy.load" % (err,))
+                                   "to numpy.load" % (err,)) from err
     else:
         if isfileobj(fp):
             # We can use the fast fromfile() function.
@@ -786,7 +848,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
 
 
 def open_memmap(filename, mode='r+', dtype=None, shape=None,
-                fortran_order=False, version=None):
+                fortran_order=False, version=None, *,
+                max_header_size=_MAX_HEADER_SIZE):
     """
     Open a .npy file as a memory-mapped array.
@@ -817,6 +880,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         If the mode is a "write" mode, then this is the version of the file
         format used to create the file. None means use the oldest
         supported version that is able to store the data. Default: None
+    max_header_size : int, optional
+        Maximum allowed size of the header. Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
@@ -827,7 +894,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
     ------
     ValueError
         If the data or the mode is invalid.
-    IOError
+    OSError
         If the file is not found or cannot be opened correctly.
 
     See Also
@@ -864,7 +931,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
 
         version = read_magic(fp)
         _check_version(version)
-        shape, fortran_order, dtype = _read_array_header(fp, version)
+        shape, fortran_order, dtype = _read_array_header(
+            fp, version, max_header_size=max_header_size)
         if dtype.hasobject:
             msg = "Array can't be memory-mapped: Python objects in dtype."
             raise ValueError(msg)
@@ -905,7 +973,7 @@ def _read_bytes(fp, size, error_template="ran out of data"):
             data += r
             if len(r) == 0 or len(data) == size:
                 break
-        except io.BlockingIOError:
+        except BlockingIOError:
             pass
     if len(data) != size:
         msg = "EOF: reading %s, expected %d bytes got %d"
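For context on the API change above: a minimal usage sketch of the new `max_header_size` keyword, assuming a NumPy build that already contains this change. The `example.npy` filename and the limit value are illustrative; the function signatures follow the diff.

    import numpy as np
    from numpy.lib.format import read_array, write_array, open_memmap

    # Write a small array in .npy format.
    arr = np.arange(12, dtype=np.int64).reshape(3, 4)
    with open("example.npy", "wb") as f:
        write_array(f, arr)

    # Read it back while capping the header size that _read_array_header()
    # will hand to safe_eval(); 10000 mirrors the _MAX_HEADER_SIZE default,
    # and an oversized header now raises ValueError instead of being parsed.
    with open("example.npy", "rb") as f:
        data = read_array(f, allow_pickle=False, max_header_size=10000)

    # The same keyword-only argument is accepted by open_memmap(); with
    # allow_pickle=True the limit is effectively ignored, since a pickled
    # file is by definition trusted.
    mm = open_memmap("example.npy", mode="r", max_header_size=10000)
    print(data.shape, mm.shape)  # (3, 4) (3, 4)

Note also the second behavioural change visible in `_write_array_header`: headers are now padded with up to GROWTH_AXIS_MAX_DIGITS (21, i.e. `len(str(8 * 2**64 - 1))`) spare spaces, so the shape entry can later be rewritten in place when the array is grown along its first axis (or its last axis, for Fortran order).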