diff options
author | Eric Wieser <wieser.eric@gmail.com> | 2017-07-28 20:59:45 +0100 |
---|---|---|
committer | Eric Wieser <wieser.eric@gmail.com> | 2017-10-19 00:26:44 -0700 |
commit | 6bc01b4e6187c8689e9e82e020a47b765e4f8fb3 (patch) | |
tree | 3680ab354d11faad7f5b1720c58cbc351563215a | |
parent | e657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff) | |
download | numpy-6bc01b4e6187c8689e9e82e020a47b765e4f8fb3.tar.gz |
DEP: Letting fromstring pretend to be frombuffer is a bad idea
Interpreting a unicode string as raw binary data is a terrible idea, especially when the encoding is determined by the system (as on Python 2).
-rw-r--r-- | doc/release/1.14.0-notes.rst | 6 | ||||
-rw-r--r-- | numpy/add_newdocs.py | 23 | ||||
-rw-r--r-- | numpy/core/src/multiarray/multiarraymodule.c | 11 | ||||
-rw-r--r-- | numpy/core/tests/test_multiarray.py | 9 | ||||
-rw-r--r-- | numpy/core/tests/test_regression.py | 16 |
5 files changed, 44 insertions, 21 deletions
diff --git a/doc/release/1.14.0-notes.rst b/doc/release/1.14.0-notes.rst index 90dae9695..8ab78c884 100644 --- a/doc/release/1.14.0-notes.rst +++ b/doc/release/1.14.0-notes.rst @@ -25,6 +25,12 @@ Deprecations empty, use ``array.size > 0``. * Calling ``np.bincount`` with ``minlength=None`` is deprecated - instead, ``minlength=0`` should be used. +``np.fromstring`` should always be passed a ``sep`` argument +------------------------------------------------------------ +Without this argument, this falls back on a broken version of `np.frombuffer` +that silently accepts and then encodes unicode strings. If reading binary data +is desired, ``frombuffer`` should be used directly. + Future Changes ============== diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py index ce2f1c6ec..7175a0e0b 100644 --- a/numpy/add_newdocs.py +++ b/numpy/add_newdocs.py @@ -961,7 +961,7 @@ add_newdoc('numpy.core.multiarray', 'fromstring', """ fromstring(string, dtype=float, count=-1, sep='') - A new 1-D array initialized from raw binary or text data in a string. + A new 1-D array initialized from text data in a string. Parameters ---------- @@ -975,11 +975,13 @@ add_newdoc('numpy.core.multiarray', 'fromstring', negative (the default), the count will be determined from the length of the data. sep : str, optional - If not provided or, equivalently, the empty string, the data will - be interpreted as binary data; otherwise, as ASCII text with - decimal numbers. Also in this latter case, this argument is - interpreted as the string separating numbers in the data; extra - whitespace between elements is also ignored. + The string separating numbers in the data; extra whitespace between + elements is also ignored. + + .. deprecated:: 1.14 + If this argument is not provided `fromstring` falls back on the + behaviour of `frombuffer`, after encoding unicode string inputs as + either utf-8 (python 3), or the default encoding (python 2). 
Returns ------- @@ -998,14 +1000,10 @@ add_newdoc('numpy.core.multiarray', 'fromstring', Examples -------- - >>> np.fromstring('\\x01\\x02', dtype=np.uint8) - array([1, 2], dtype=uint8) >>> np.fromstring('1 2', dtype=int, sep=' ') array([1, 2]) >>> np.fromstring('1, 2', dtype=int, sep=',') array([1, 2]) - >>> np.fromstring('\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3) - array([1, 2, 3], dtype=uint8) """) @@ -1154,6 +1152,11 @@ add_newdoc('numpy.core.multiarray', 'frombuffer', array(['w', 'o', 'r', 'l', 'd'], dtype='|S1') + >>> np.frombuffer(b'\\x01\\x02', dtype=np.uint8) + array([1, 2], dtype=uint8) + >>> np.frombuffer(b'\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3) + array([1, 2, 3], dtype=uint8) + """) add_newdoc('numpy.core.multiarray', 'concatenate', diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 210882ff0..600a5ac32 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -2098,6 +2098,17 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds Py_XDECREF(descr); return NULL; } + + // binary mode, condition copied from PyArray_FromString + if (sep == NULL || strlen(sep) == 0) { + /* Numpy 1.14, 2017-10-19 */ + if (DEPRECATE( + "The binary mode of fromstring is deprecated, as it behaves " + "surprisingly on unicode inputs. 
Use frombuffer instead") < 0) { + Py_DECREF(descr); + return NULL; + } + } return PyArray_FromString(data, (npy_intp)s, descr, (npy_intp)nin, sep); } diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index b1a6fbe44..3a5ca8d6d 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -4183,11 +4183,11 @@ class TestIO(object): def test_roundtrip_binary_str(self): s = self.x.tobytes() - y = np.fromstring(s, dtype=self.dtype) + y = np.frombuffer(s, dtype=self.dtype) assert_array_equal(y, self.x.flat) s = self.x.tobytes('F') - y = np.fromstring(s, dtype=self.dtype) + y = np.frombuffer(s, dtype=self.dtype) assert_array_equal(y, self.x.flatten('F')) def test_roundtrip_str(self): @@ -4302,7 +4302,10 @@ class TestIO(object): assert_equal(pos, 10, err_msg=err_msg) def _check_from(self, s, value, **kw): - y = np.fromstring(s, **kw) + if 'sep' not in kw: + y = np.frombuffer(s, **kw) + else: + y = np.fromstring(s, **kw) assert_array_equal(y, value) f = open(self.filename, 'wb') diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index 34f9080fb..f791f6725 100644 --- a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -289,7 +289,7 @@ class TestRegression(object): # Fix in r2836 # Create non-contiguous Fortran ordered array x = np.array(np.random.rand(3, 3), order='F')[:, :2] - assert_array_almost_equal(x.ravel(), np.fromstring(x.tobytes())) + assert_array_almost_equal(x.ravel(), np.frombuffer(x.tobytes())) def test_flat_assignment(self): # Correct behaviour of ticket #194 @@ -833,14 +833,14 @@ class TestRegression(object): def test_string_argsort_with_zeros(self): # Check argsort for strings containing zeros. 
- x = np.fromstring("\x00\x02\x00\x01", dtype="|S2") + x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2") assert_array_equal(x.argsort(kind='m'), np.array([1, 0])) assert_array_equal(x.argsort(kind='q'), np.array([1, 0])) def test_string_sort_with_zeros(self): # Check sort for strings containing zeros. - x = np.fromstring("\x00\x02\x00\x01", dtype="|S2") - y = np.fromstring("\x00\x01\x00\x02", dtype="|S2") + x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2") + y = np.frombuffer(b"\x00\x01\x00\x02", dtype="|S2") assert_array_equal(np.sort(x, kind="q"), y) def test_copy_detection_zero_dim(self): @@ -1430,10 +1430,10 @@ class TestRegression(object): y = x.byteswap() if x.dtype.byteorder == z.dtype.byteorder: # little-endian machine - assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype.newbyteorder())) + assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype.newbyteorder())) else: # big-endian machine - assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype)) + assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype)) # double check real and imaginary parts: assert_equal(x.real, y.real.byteswap()) assert_equal(x.imag, y.imag.byteswap()) @@ -1783,8 +1783,8 @@ class TestRegression(object): assert_equal(a1, a2) def test_fields_strides(self): - "Ticket #1760" - r = np.fromstring('abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2') + "gh-2355" + r = np.frombuffer(b'abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2') assert_equal(r[0:3:2]['f1'], r['f1'][0:3:2]) assert_equal(r[0:3:2]['f1'][0], r[0:3:2][0]['f1']) assert_equal(r[0:3:2]['f1'][0][()], r[0:3:2][0]['f1'][()]) |