DEP: Letting fromstring pretend to be frombuffer is a bad idea

Interpreting a unicode string as raw binary data is a terrible idea, especially if the encoding is determined by the system (python 2)
author: Eric Wieser <wieser.eric@gmail.com> 2017-07-28 20:59:45 +0100
committer: Eric Wieser <wieser.eric@gmail.com> 2017-10-19 00:26:44 -0700
commit: 6bc01b4e6187c8689e9e82e020a47b765e4f8fb3 (patch)
tree: 3680ab354d11faad7f5b1720c58cbc351563215a
parent: e657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff)
download: numpy-6bc01b4e6187c8689e9e82e020a47b765e4f8fb3.tar.gz
5 files changed, 44 insertions, 21 deletions
diff --git a/doc/release/1.14.0-notes.rst b/doc/release/1.14.0-notes.rst
index 90dae9695..8ab78c884 100644
--- a/doc/release/1.14.0-notes.rst
+++ b/doc/release/1.14.0-notes.rst
@@ -25,6 +25,12 @@ Deprecations
   empty, use ``array.size > 0``.
 * Calling ``np.bincount`` with ``minlength=None`` is deprecated - instead,
   ``minlength=0`` should be used.
+``np.fromstring`` should always be passed a ``sep`` argument
+------------------------------------------------------------
+Without this argument, this falls back on a broken version of `np.frombuffer`
+that silently accepts and then encodes unicode strings. If reading binary data
+is desired, ``frombuffer`` should be used directly.
+
 
 Future Changes
 ==============
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index ce2f1c6ec..7175a0e0b 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -961,7 +961,7 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
     """
     fromstring(string, dtype=float, count=-1, sep='')
 
-    A new 1-D array initialized from raw binary or text data in a string.
+    A new 1-D array initialized from text data in a string.
 
     Parameters
     ----------
@@ -975,11 +975,13 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
         negative (the default), the count will be determined from the
         length of the data.
     sep : str, optional
-        If not provided or, equivalently, the empty string, the data will
-        be interpreted as binary data; otherwise, as ASCII text with
-        decimal numbers.  Also in this latter case, this argument is
-        interpreted as the string separating numbers in the data; extra
-        whitespace between elements is also ignored.
+        The string separating numbers in the data; extra whitespace between
+        elements is also ignored.
+
+        .. deprecated:: 1.14
+            If this argument is not provided, `fromstring` falls back on the
+            behaviour of `frombuffer`, after encoding unicode string inputs as
+            either utf-8 (python 3) or the default encoding (python 2).
 
     Returns
     -------
@@ -998,14 +1000,10 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
 
     Examples
     --------
-    >>> np.fromstring('\\x01\\x02', dtype=np.uint8)
-    array([1, 2], dtype=uint8)
     >>> np.fromstring('1 2', dtype=int, sep=' ')
     array([1, 2])
     >>> np.fromstring('1, 2', dtype=int, sep=',')
     array([1, 2])
-    >>> np.fromstring('\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
-    array([1, 2, 3], dtype=uint8)
 
     """)
 
@@ -1154,6 +1152,11 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
     array(['w', 'o', 'r', 'l', 'd'],
           dtype='|S1')
 
+    >>> np.frombuffer(b'\\x01\\x02', dtype=np.uint8)
+    array([1, 2], dtype=uint8)
+    >>> np.frombuffer(b'\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
+    array([1, 2, 3], dtype=uint8)
+
     """)
 
 add_newdoc('numpy.core.multiarray', 'concatenate',
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 210882ff0..600a5ac32 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -2098,6 +2098,17 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
         Py_XDECREF(descr);
         return NULL;
     }
+
+    /* binary mode, condition copied from PyArray_FromString */
+    if (sep == NULL || strlen(sep) == 0) {
+        /* Numpy 1.14, 2017-10-19 */
+        if (DEPRECATE(
+                "The binary mode of fromstring is deprecated, as it behaves "
+                "surprisingly on unicode inputs. Use frombuffer instead") < 0) {
+            Py_DECREF(descr);
+            return NULL;
+        }
+    }
     return PyArray_FromString(data, (npy_intp)s, descr, (npy_intp)nin, sep);
 }
 
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b1a6fbe44..3a5ca8d6d 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4183,11 +4183,11 @@ class TestIO(object):
 
     def test_roundtrip_binary_str(self):
         s = self.x.tobytes()
-        y = np.fromstring(s, dtype=self.dtype)
+        y = np.frombuffer(s, dtype=self.dtype)
         assert_array_equal(y, self.x.flat)
 
         s = self.x.tobytes('F')
-        y = np.fromstring(s, dtype=self.dtype)
+        y = np.frombuffer(s, dtype=self.dtype)
         assert_array_equal(y, self.x.flatten('F'))
 
     def test_roundtrip_str(self):
@@ -4302,7 +4302,10 @@ class TestIO(object):
             assert_equal(pos, 10, err_msg=err_msg)
 
     def _check_from(self, s, value, **kw):
-        y = np.fromstring(s, **kw)
+        if 'sep' not in kw:
+            y = np.frombuffer(s, **kw)
+        else:
+            y = np.fromstring(s, **kw)
         assert_array_equal(y, value)
 
         f = open(self.filename, 'wb')
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 34f9080fb..f791f6725 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -289,7 +289,7 @@ class TestRegression(object):
         # Fix in r2836
         # Create non-contiguous Fortran ordered array
         x = np.array(np.random.rand(3, 3), order='F')[:, :2]
-        assert_array_almost_equal(x.ravel(), np.fromstring(x.tobytes()))
+        assert_array_almost_equal(x.ravel(), np.frombuffer(x.tobytes()))
 
     def test_flat_assignment(self):
         # Correct behaviour of ticket #194
@@ -833,14 +833,14 @@ class TestRegression(object):
 
     def test_string_argsort_with_zeros(self):
         # Check argsort for strings containing zeros.
-        x = np.fromstring("\x00\x02\x00\x01", dtype="|S2")
+        x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
         assert_array_equal(x.argsort(kind='m'), np.array([1, 0]))
         assert_array_equal(x.argsort(kind='q'), np.array([1, 0]))
 
     def test_string_sort_with_zeros(self):
         # Check sort for strings containing zeros.
-        x = np.fromstring("\x00\x02\x00\x01", dtype="|S2")
-        y = np.fromstring("\x00\x01\x00\x02", dtype="|S2")
+        x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
+        y = np.frombuffer(b"\x00\x01\x00\x02", dtype="|S2")
         assert_array_equal(np.sort(x, kind="q"), y)
 
     def test_copy_detection_zero_dim(self):
@@ -1430,10 +1430,10 @@ class TestRegression(object):
             y = x.byteswap()
             if x.dtype.byteorder == z.dtype.byteorder:
                 # little-endian machine
-                assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype.newbyteorder()))
+                assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype.newbyteorder()))
             else:
                 # big-endian machine
-                assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype))
+                assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype))
             # double check real and imaginary parts:
             assert_equal(x.real, y.real.byteswap())
             assert_equal(x.imag, y.imag.byteswap())
@@ -1783,8 +1783,8 @@ class TestRegression(object):
             assert_equal(a1, a2)
 
     def test_fields_strides(self):
-        "Ticket #1760"
-        r = np.fromstring('abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2')
+        "gh-2355"
+        r = np.frombuffer(b'abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2')
         assert_equal(r[0:3:2]['f1'], r['f1'][0:3:2])
         assert_equal(r[0:3:2]['f1'][0], r[0:3:2][0]['f1'])
         assert_equal(r[0:3:2]['f1'][0][()], r[0:3:2][0]['f1'][()])
author	Eric Wieser <wieser.eric@gmail.com>	2017-07-28 20:59:45 +0100
committer	Eric Wieser <wieser.eric@gmail.com>	2017-10-19 00:26:44 -0700
commit	6bc01b4e6187c8689e9e82e020a47b765e4f8fb3 (patch)
tree	3680ab354d11faad7f5b1720c58cbc351563215a
parent	e657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff)
download	numpy-6bc01b4e6187c8689e9e82e020a47b765e4f8fb3.tar.gz