diff options
-rw-r--r-- | doc/release/1.17.0-notes.rst | 5 | ||||
-rw-r--r-- | numpy/lib/format.py | 51 | ||||
-rw-r--r-- | numpy/lib/tests/test_format.py | 25 |
3 files changed, 66 insertions, 15 deletions
diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst index 71ad17673..b18c0241c 100644 --- a/doc/release/1.17.0-notes.rst +++ b/doc/release/1.17.0-notes.rst @@ -149,6 +149,11 @@ Floating point scalars implement ``as_integer_ratio`` to match the builtin float This returns a (numerator, denominator) pair, which can be used to construct a `fractions.Fraction`. +``.npy`` files support unicode field names +------------------------------------------ +A new format version of 3.0 has been introduced, which enables structured types +with non-latin1 field names. This is used automatically when needed. + Improvements ============ diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 271bc4a19..7ede0031f 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -146,6 +146,13 @@ The description of the fourth element of the header therefore has become: "The next 4 bytes form a little-endian unsigned int: the length of the header data HEADER_LEN." +Format Version 3.0 +------------------ + +This version replaces the ASCII string (which in practice was latin1) with +a utf8-encoded string, so it supports structured types with any unicode field +names. 
+ Notes ----- The ``.npy`` format, including motivation for creating it and a comparison of @@ -162,7 +169,7 @@ import io import warnings from numpy.lib.utils import safe_eval from numpy.compat import ( - asbytes, asstr, isfileobj, long, os_fspath, pickle + isfileobj, long, os_fspath, pickle ) @@ -173,15 +180,16 @@ BUFFER_SIZE = 2**18 # size of buffer for reading npz files in bytes # difference between version 1.0 and 2.0 is a 4 byte (I) header length # instead of 2 bytes (H) allowing storage of large structured arrays -_header_size_formats = { - (1, 0): '<H', - (2, 0): '<I', +_header_size_info = { + (1, 0): ('<H', 'latin1'), + (2, 0): ('<I', 'latin1'), + (3, 0): ('<I', 'utf8'), } def _check_version(version): - if version not in [(1, 0), (2, 0), None]: - msg = "we only support format version (1,0) and (2, 0), not %s" + if version not in [(1, 0), (2, 0), (3, 0), None]: + msg = "we only support format version (1,0), (2,0), and (3,0), not %s" raise ValueError(msg % (version,)) def magic(major, minor): @@ -334,8 +342,9 @@ def _wrap_header(header, version): """ import struct assert version is not None - header = asbytes(header) - fmt = _header_size_formats[version] + fmt, encoding = _header_size_info[version] + if not isinstance(header, bytes): # always true on python 3 + header = header.encode(encoding) hlen = len(header) + 1 padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN) try: @@ -360,9 +369,19 @@ def _wrap_header_guess_version(header): return _wrap_header(header, (1, 0)) except ValueError: pass - header = _wrap_header(header, (2, 0)) - warnings.warn("Stored array in format 2.0. It can only be" - "read by NumPy >= 1.9", UserWarning, stacklevel=2) + + try: + ret = _wrap_header(header, (2, 0)) + except UnicodeEncodeError: + pass + else: + warnings.warn("Stored array in format 2.0. 
It can only be" + "read by NumPy >= 1.9", UserWarning, stacklevel=2) + return ret + + header = _wrap_header(header, (3, 0)) + warnings.warn("Stored array in format 3.0. It can only be" + "read by NumPy >= 1.17", UserWarning, stacklevel=2) return header @@ -494,7 +513,7 @@ def _filter_header(s): Parameters ---------- - s : byte string + s : string Npy file header. Returns @@ -512,7 +531,7 @@ def _filter_header(s): tokens = [] last_token_was_number = False # adding newline as python 2.7.5 workaround - string = asstr(s) + "\n" + string = s + "\n" for token in tokenize.generate_tokens(StringIO(string).readline): token_type = token[0] token_string = token[1] @@ -534,13 +553,15 @@ def _read_array_header(fp, version): # Read an unsigned, little-endian short int which has the length of the # header. import struct - hlength_type = _header_size_formats.get(version) - if hlength_type is None: + hinfo = _header_size_info.get(version) + if hinfo is None: raise ValueError("Invalid version {!r}".format(version)) + hlength_type, encoding = hinfo hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length") header_length = struct.unpack(hlength_type, hlength_str)[0] header = _read_bytes(fp, header_length, "array header") + header = header.decode(encoding) # The header is a pretty-printed string representation of a literal # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py index 2ebd483d5..2cf799723 100644 --- a/numpy/lib/tests/test_format.py +++ b/numpy/lib/tests/test_format.py @@ -287,6 +287,7 @@ from io import BytesIO import numpy as np from numpy.testing import ( assert_, assert_array_equal, assert_raises, assert_raises_regex, + assert_warns ) from numpy.lib import format @@ -882,3 +883,27 @@ def test_empty_npz(): fname = os.path.join(tempdir, "nothing.npz") np.savez(fname) np.load(fname) + + +def test_unicode_field_names(): + # gh-7391 + arr = np.array([ + (1, 
3), + (1, 2), + (1, 3), + (1, 2) + ], dtype=[ + ('int', int), + (u'\N{CJK UNIFIED IDEOGRAPH-6574}\N{CJK UNIFIED IDEOGRAPH-5F62}', int) + ]) + fname = os.path.join(tempdir, "unicode.npy") + with open(fname, 'wb') as f: + format.write_array(f, arr, version=(3, 0)) + with open(fname, 'rb') as f: + arr2 = format.read_array(f) + assert_array_equal(arr, arr2) + + # notifies the user that 3.0 is selected + with open(fname, 'wb') as f: + with assert_warns(UserWarning): + format.write_array(f, arr, version=None) |