author:    Eric Wieser <wieser.eric@gmail.com>  2019-08-19 19:16:44 -0500
committer: Eric Wieser <wieser.eric@gmail.com>  2019-08-19 19:16:44 -0500
commit:    0f5e376d3eb6118b783cdd3ecd27722c2d1934ba (patch)
tree:      c44850b579cbd27993c45dda1a7922e2d109b24f /numpy/lib/format.py
parent:    483f565d85dadc899f94710531fba8355d554d59 (diff)
parent:    98bdde643af6443d68a8c6233807b75bd3f0ed80 (diff)
download:  numpy-0f5e376d3eb6118b783cdd3ecd27722c2d1934ba.tar.gz
Merge remote-tracking branch 'upstream/master' into fix-if-fields
Diffstat (limited to 'numpy/lib/format.py')
-rw-r--r--  numpy/lib/format.py  196
1 file changed, 110 insertions(+), 86 deletions(-)
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 10945e5e8..3bf818812 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -146,10 +146,17 @@
 The description of the fourth element of the header therefore has become:
 "The next 4 bytes form a little-endian unsigned int: the length of the header
 data HEADER_LEN."
 
+Format Version 3.0
+------------------
+
+This version replaces the ASCII string (which in practice was latin1) with
+a utf8-encoded string, so supports structured types with any unicode field
+names.
+
 Notes
 -----
 The ``.npy`` format, including motivation for creating it and a comparison of
 alternatives, is described in the `"npy-format" NEP
 <https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have
 evolved with time and this document is more current.
@@ -162,9 +169,8 @@ import io
 import warnings
 from numpy.lib.utils import safe_eval
 from numpy.compat import (
-    asbytes, asstr, isfileobj, long, os_fspath
+    isfileobj, long, os_fspath, pickle
     )
-from numpy.core.numeric import pickle
 
 
 MAGIC_PREFIX = b'\x93NUMPY'
@@ -174,10 +180,16 @@ BUFFER_SIZE = 2**18  # size of buffer for reading npz files in bytes
 
 # difference between version 1.0 and 2.0 is a 4 byte (I) header length
 # instead of 2 bytes (H) allowing storage of large structured arrays
 
+_header_size_info = {
+    (1, 0): ('<H', 'latin1'),
+    (2, 0): ('<I', 'latin1'),
+    (3, 0): ('<I', 'utf8'),
+}
+
 def _check_version(version):
-    if version not in [(1, 0), (2, 0), None]:
-        msg = "we only support format version (1,0) and (2, 0), not %s"
+    if version not in [(1, 0), (2, 0), (3, 0), None]:
+        msg = "we only support format version (1,0), (2,0), and (3,0), not %s"
         raise ValueError(msg % (version,))
 
 def magic(major, minor):
@@ -262,15 +274,19 @@ def dtype_to_descr(dtype):
 def descr_to_dtype(descr):
     '''
     descr may be stored as dtype.descr, which is a list of
-    (name, format, [shape]) tuples. Offsets are not explicitly saved, rather
-    empty fields with name,format == '', '|Vn' are added as padding.
+    (name, format, [shape]) tuples where format may be a str or a tuple.
+    Offsets are not explicitly saved, rather empty fields with
+    name, format == '', '|Vn' are added as padding.
 
     This function reverses the process, eliminating the empty padding fields.
     '''
-    if isinstance(descr, (str, dict)):
+    if isinstance(descr, str):
         # No padding removal needed
         return numpy.dtype(descr)
-
+    elif isinstance(descr, tuple):
+        # subtype, will always have a shape descr[1]
+        dt = descr_to_dtype(descr[0])
+        return numpy.dtype((dt, descr[1]))
     fields = []
     offset = 0
     for field in descr:
@@ -323,6 +339,56 @@ def header_data_from_array_1_0(array):
     d['descr'] = dtype_to_descr(array.dtype)
     return d
 
+
+def _wrap_header(header, version):
+    """
+    Takes a stringified header, and attaches the prefix and padding to it
+    """
+    import struct
+    assert version is not None
+    fmt, encoding = _header_size_info[version]
+    if not isinstance(header, bytes):  # always true on python 3
+        header = header.encode(encoding)
+    hlen = len(header) + 1
+    padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
+    try:
+        header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
+    except struct.error:
+        msg = "Header length {} too big for version={}".format(hlen, version)
+        raise ValueError(msg)
+
+    # Pad the header with spaces and a final newline such that the magic
+    # string, the header-length short and the header are aligned on a
+    # ARRAY_ALIGN byte boundary.  This supports memory mapping of dtypes
+    # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
+    # offset must be page-aligned (i.e. the beginning of the file).
+    return header_prefix + header + b' '*padlen + b'\n'
+
+
+def _wrap_header_guess_version(header):
+    """
+    Like `_wrap_header`, but chooses an appropriate version given the contents
+    """
+    try:
+        return _wrap_header(header, (1, 0))
+    except ValueError:
+        pass
+
+    try:
+        ret = _wrap_header(header, (2, 0))
+    except UnicodeEncodeError:
+        pass
+    else:
+        warnings.warn("Stored array in format 2.0. It can only be"
+                      "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+        return ret
+
+    header = _wrap_header(header, (3, 0))
+    warnings.warn("Stored array in format 3.0. It can only be "
+                  "read by NumPy >= 1.17", UserWarning, stacklevel=2)
+    return header
+
+
 def _write_array_header(fp, d, version=None):
     """ Write the header for an array and returns the version used
@@ -336,48 +402,19 @@ def _write_array_header(fp, d, version=None):
         None means use oldest that works
         explicit version will raise a ValueError if the format does not
         allow saving this data.  Default: None
-    Returns
-    -------
-    version : tuple of int
-        the file version which needs to be used to store the data
     """
-    import struct
     header = ["{"]
     for key, value in sorted(d.items()):
         # Need to use repr here, since we eval these when reading
         header.append("'%s': %s, " % (key, repr(value)))
     header.append("}")
     header = "".join(header)
-    header = asbytes(_filter_header(header))
-
-    hlen = len(header) + 1 # 1 for newline
-    padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
-    padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)
-
-    # Which version(s) we write depends on the total header size; v1 has a max of 65535
-    if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
-        version = (1, 0)
-        header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
-        topad = padlen_v1
-    elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
-        version = (2, 0)
-        header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
-        topad = padlen_v2
+    header = _filter_header(header)
+    if version is None:
+        header = _wrap_header_guess_version(header)
     else:
-        msg = "Header length %s too big for version=%s"
-        msg %= (hlen, version)
-        raise ValueError(msg)
-
-    # Pad the header with spaces and a final newline such that the magic
-    # string, the header-length short and the header are aligned on a
-    # ARRAY_ALIGN byte boundary.  This supports memory mapping of dtypes
-    # aligned up to ARRAY_ALIGN on systems like Linux where mmap()
-    # offset must be page-aligned (i.e. the beginning of the file).
-    header = header + b' '*topad + b'\n'
-
-    fp.write(header_prefix)
+        header = _wrap_header(header, version)
     fp.write(header)
-    return version
 
 def write_array_header_1_0(fp, d):
     """ Write the header for an array using the 1.0 format.
@@ -480,7 +517,7 @@ def _filter_header(s):
 
     Parameters
    ----------
-    s : byte string
+    s : string
       Npy file header.
 
     Returns
@@ -498,7 +535,7 @@ def _filter_header(s):
     tokens = []
     last_token_was_number = False
     # adding newline as python 2.7.5 workaround
-    string = asstr(s) + "\n"
+    string = s + "\n"
     for token in tokenize.generate_tokens(StringIO(string).readline):
         token_type = token[0]
         token_string = token[1]
@@ -520,16 +557,15 @@
     # Read an unsigned, little-endian short int which has the length of the
     # header.
     import struct
-    if version == (1, 0):
-        hlength_type = '<H'
-    elif version == (2, 0):
-        hlength_type = '<I'
-    else:
-        raise ValueError("Invalid version %r" % version)
+    hinfo = _header_size_info.get(version)
+    if hinfo is None:
+        raise ValueError("Invalid version {!r}".format(version))
+    hlength_type, encoding = hinfo
 
     hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
     header_length = struct.unpack(hlength_type, hlength_str)[0]
     header = _read_bytes(fp, header_length, "array header")
+    header = header.decode(encoding)
 
     # The header is a pretty-printed string representation of a literal
     # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
@@ -541,29 +577,29 @@
     try:
         d = safe_eval(header)
     except SyntaxError as e:
-        msg = "Cannot parse header: %r\nException: %r"
-        raise ValueError(msg % (header, e))
+        msg = "Cannot parse header: {!r}\nException: {!r}"
+        raise ValueError(msg.format(header, e))
     if not isinstance(d, dict):
-        msg = "Header is not a dictionary: %r"
-        raise ValueError(msg % d)
+        msg = "Header is not a dictionary: {!r}"
+        raise ValueError(msg.format(d))
     keys = sorted(d.keys())
     if keys != ['descr', 'fortran_order', 'shape']:
-        msg = "Header does not contain the correct keys: %r"
-        raise ValueError(msg % (keys,))
+        msg = "Header does not contain the correct keys: {!r}"
+        raise ValueError(msg.format(keys))
 
     # Sanity-check the values.
     if (not isinstance(d['shape'], tuple) or
             not numpy.all([isinstance(x, (int, long)) for x in d['shape']])):
-        msg = "shape is not valid: %r"
-        raise ValueError(msg % (d['shape'],))
+        msg = "shape is not valid: {!r}"
+        raise ValueError(msg.format(d['shape']))
     if not isinstance(d['fortran_order'], bool):
-        msg = "fortran_order is not a valid bool: %r"
-        raise ValueError(msg % (d['fortran_order'],))
+        msg = "fortran_order is not a valid bool: {!r}"
+        raise ValueError(msg.format(d['fortran_order']))
     try:
         dtype = descr_to_dtype(d['descr'])
     except TypeError as e:
-        msg = "descr is not a valid dtype descriptor: %r"
-        raise ValueError(msg % (d['descr'],))
+        msg = "descr is not a valid dtype descriptor: {!r}"
+        raise ValueError(msg.format(d['descr']))
 
     return d['shape'], d['fortran_order'], dtype
@@ -604,12 +640,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
 
     """
     _check_version(version)
-    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
-                                   version)
-    # this warning can be removed when 1.9 has aged enough
-    if version != (2, 0) and used_ver == (2, 0):
-        warnings.warn("Stored array in format 2.0. It can only be"
-                      "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+    _write_array_header(fp, header_data_from_array_1_0(array), version)
 
     if array.itemsize == 0:
         buffersize = 0
@@ -619,14 +650,13 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
 
     if array.dtype.hasobject:
         # We contain Python objects so we cannot write out the data
-        # directly.  Instead, we will pickle it out with version 2 of the
-        # pickle protocol.
+        # directly.  Instead, we will pickle it out
         if not allow_pickle:
             raise ValueError("Object arrays cannot be saved when "
                              "allow_pickle=False")
         if pickle_kwargs is None:
             pickle_kwargs = {}
-        pickle.dump(array, fp, protocol=2, **pickle_kwargs)
+        pickle.dump(array, fp, protocol=3, **pickle_kwargs)
     elif array.flags.f_contiguous and not array.flags.c_contiguous:
         if isfileobj(fp):
             array.T.tofile(fp)
@@ -645,7 +675,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
                 fp.write(chunk.tobytes('C'))
 
 
-def read_array(fp, allow_pickle=True, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None):
     """
     Read an array from an NPY file.
 
@@ -655,7 +685,11 @@
         If this is not a real file object, then this may take extra memory
         and time.
     allow_pickle : bool, optional
-        Whether to allow reading pickled data. Default: True
+        Whether to allow writing pickled data. Default: False
+
+        .. versionchanged:: 1.16.3
+            Made default False in response to CVE-2019-6446.
+
     pickle_kwargs : dict
         Additional keyword arguments to pass to pickle.load. These are only
         useful when loading object arrays saved on Python 2 when using
@@ -806,20 +840,12 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
             shape=shape,
         )
         # If we got here, then it should be safe to create the file.
-        fp = open(os_fspath(filename), mode+'b')
-        try:
-            used_ver = _write_array_header(fp, d, version)
-            # this warning can be removed when 1.9 has aged enough
-            if version != (2, 0) and used_ver == (2, 0):
-                warnings.warn("Stored array in format 2.0. It can only be"
-                              "read by NumPy >= 1.9", UserWarning, stacklevel=2)
+        with open(os_fspath(filename), mode+'b') as fp:
+            _write_array_header(fp, d, version)
             offset = fp.tell()
-        finally:
-            fp.close()
     else:
         # Read the header of the file first.
-        fp = open(os_fspath(filename), 'rb')
-        try:
+        with open(os_fspath(filename), 'rb') as fp:
            version = read_magic(fp)
            _check_version(version)
@@ -828,8 +854,6 @@
                 msg = "Array can't be memory-mapped: Python objects in dtype."
                 raise ValueError(msg)
             offset = fp.tell()
-        finally:
-            fp.close()
 
     if fortran_order:
         order = 'F'
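As a quick illustration of what the merged header changes enable, the following is a minimal sketch (not part of the commit, assumes NumPy >= 1.17, file name chosen arbitrarily): a structured dtype whose field name cannot be encoded as latin1 makes the header writer fall through `_wrap_header_guess_version` to the utf8 (3, 0) format, which `read_magic` then reports; reading it back needs no pickling because the array contains no Python objects.

```python
# Sketch only: demonstrates the format 3.0 fallback added in this merge.
# Assumes NumPy >= 1.17; 'unicode_fields.npy' is an arbitrary example path.
import numpy as np
from numpy.lib import format as npy_format

# A structured dtype with a field name that latin1 cannot encode.
arr = np.zeros(3, dtype=[('温度', '<f8')])

# np.save -> _write_array_header -> _wrap_header_guess_version, which falls
# back to the utf8-encoded (3, 0) header for this dtype (and warns about it).
np.save('unicode_fields.npy', arr)

with open('unicode_fields.npy', 'rb') as fp:
    version = npy_format.read_magic(fp)   # expected: (3, 0)

# No allow_pickle needed on load: the array holds no Python objects.
roundtripped = np.load('unicode_fields.npy')
assert roundtripped.dtype == arr.dtype
print(version, roundtripped.dtype)
```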