summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Wieser <wieser.eric@gmail.com>2017-07-28 20:59:45 +0100
committerEric Wieser <wieser.eric@gmail.com>2017-10-19 00:26:44 -0700
commit6bc01b4e6187c8689e9e82e020a47b765e4f8fb3 (patch)
tree3680ab354d11faad7f5b1720c58cbc351563215a
parente657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff)
downloadnumpy-6bc01b4e6187c8689e9e82e020a47b765e4f8fb3.tar.gz
DEP: Letting fromstring pretend to be frombuffer is a bad idea
Interpreting a unicode string as raw binary data is a terrible idea, especially if the encoding is determined by the system (python 2)
-rw-r--r--doc/release/1.14.0-notes.rst6
-rw-r--r--numpy/add_newdocs.py23
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c11
-rw-r--r--numpy/core/tests/test_multiarray.py9
-rw-r--r--numpy/core/tests/test_regression.py16
5 files changed, 44 insertions, 21 deletions
diff --git a/doc/release/1.14.0-notes.rst b/doc/release/1.14.0-notes.rst
index 90dae9695..8ab78c884 100644
--- a/doc/release/1.14.0-notes.rst
+++ b/doc/release/1.14.0-notes.rst
@@ -25,6 +25,12 @@ Deprecations
empty, use ``array.size > 0``.
* Calling ``np.bincount`` with ``minlength=None`` is deprecated - instead,
``minlength=0`` should be used.
+``np.fromstring`` should always be passed a ``sep`` argument
+------------------------------------------------------------
+Without this argument, this falls back on a broken version of `np.frombuffer`
+that silently accepts and then encodes unicode strings. If reading binary data
+is desired, ``frombuffer`` should be used directly.
+
Future Changes
==============
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index ce2f1c6ec..7175a0e0b 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -961,7 +961,7 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
"""
fromstring(string, dtype=float, count=-1, sep='')
- A new 1-D array initialized from raw binary or text data in a string.
+ A new 1-D array initialized from text data in a string.
Parameters
----------
@@ -975,11 +975,13 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
negative (the default), the count will be determined from the
length of the data.
sep : str, optional
- If not provided or, equivalently, the empty string, the data will
- be interpreted as binary data; otherwise, as ASCII text with
- decimal numbers. Also in this latter case, this argument is
- interpreted as the string separating numbers in the data; extra
- whitespace between elements is also ignored.
+ The string separating numbers in the data; extra whitespace between
+ elements is also ignored.
+
+ .. deprecated:: 1.14
+ If this argument is not provided, `fromstring` falls back on the
+ behaviour of `frombuffer`, after encoding unicode string inputs as
+ either utf-8 (python 3) or the default encoding (python 2).
Returns
-------
@@ -998,14 +1000,10 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
Examples
--------
- >>> np.fromstring('\\x01\\x02', dtype=np.uint8)
- array([1, 2], dtype=uint8)
>>> np.fromstring('1 2', dtype=int, sep=' ')
array([1, 2])
>>> np.fromstring('1, 2', dtype=int, sep=',')
array([1, 2])
- >>> np.fromstring('\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
- array([1, 2, 3], dtype=uint8)
""")
@@ -1154,6 +1152,11 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
array(['w', 'o', 'r', 'l', 'd'],
dtype='|S1')
+ >>> np.frombuffer(b'\\x01\\x02', dtype=np.uint8)
+ array([1, 2], dtype=uint8)
+ >>> np.frombuffer(b'\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
+ array([1, 2, 3], dtype=uint8)
+
""")
add_newdoc('numpy.core.multiarray', 'concatenate',
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 210882ff0..600a5ac32 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -2098,6 +2098,17 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
Py_XDECREF(descr);
return NULL;
}
+
+ // binary mode, condition copied from PyArray_FromString
+ if (sep == NULL || strlen(sep) == 0) {
+ /* Numpy 1.14, 2017-10-19 */
+ if (DEPRECATE(
+ "The binary mode of fromstring is deprecated, as it behaves "
+ "surprisingly on unicode inputs. Use frombuffer instead") < 0) {
+ Py_DECREF(descr);
+ return NULL;
+ }
+ }
return PyArray_FromString(data, (npy_intp)s, descr, (npy_intp)nin, sep);
}
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b1a6fbe44..3a5ca8d6d 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4183,11 +4183,11 @@ class TestIO(object):
def test_roundtrip_binary_str(self):
s = self.x.tobytes()
- y = np.fromstring(s, dtype=self.dtype)
+ y = np.frombuffer(s, dtype=self.dtype)
assert_array_equal(y, self.x.flat)
s = self.x.tobytes('F')
- y = np.fromstring(s, dtype=self.dtype)
+ y = np.frombuffer(s, dtype=self.dtype)
assert_array_equal(y, self.x.flatten('F'))
def test_roundtrip_str(self):
@@ -4302,7 +4302,10 @@ class TestIO(object):
assert_equal(pos, 10, err_msg=err_msg)
def _check_from(self, s, value, **kw):
- y = np.fromstring(s, **kw)
+ if 'sep' not in kw:
+ y = np.frombuffer(s, **kw)
+ else:
+ y = np.fromstring(s, **kw)
assert_array_equal(y, value)
f = open(self.filename, 'wb')
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 34f9080fb..f791f6725 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -289,7 +289,7 @@ class TestRegression(object):
# Fix in r2836
# Create non-contiguous Fortran ordered array
x = np.array(np.random.rand(3, 3), order='F')[:, :2]
- assert_array_almost_equal(x.ravel(), np.fromstring(x.tobytes()))
+ assert_array_almost_equal(x.ravel(), np.frombuffer(x.tobytes()))
def test_flat_assignment(self):
# Correct behaviour of ticket #194
@@ -833,14 +833,14 @@ class TestRegression(object):
def test_string_argsort_with_zeros(self):
# Check argsort for strings containing zeros.
- x = np.fromstring("\x00\x02\x00\x01", dtype="|S2")
+ x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
assert_array_equal(x.argsort(kind='m'), np.array([1, 0]))
assert_array_equal(x.argsort(kind='q'), np.array([1, 0]))
def test_string_sort_with_zeros(self):
# Check sort for strings containing zeros.
- x = np.fromstring("\x00\x02\x00\x01", dtype="|S2")
- y = np.fromstring("\x00\x01\x00\x02", dtype="|S2")
+ x = np.frombuffer(b"\x00\x02\x00\x01", dtype="|S2")
+ y = np.frombuffer(b"\x00\x01\x00\x02", dtype="|S2")
assert_array_equal(np.sort(x, kind="q"), y)
def test_copy_detection_zero_dim(self):
@@ -1430,10 +1430,10 @@ class TestRegression(object):
y = x.byteswap()
if x.dtype.byteorder == z.dtype.byteorder:
# little-endian machine
- assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype.newbyteorder()))
+ assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype.newbyteorder()))
else:
# big-endian machine
- assert_equal(x, np.fromstring(y.tobytes(), dtype=dtype))
+ assert_equal(x, np.frombuffer(y.tobytes(), dtype=dtype))
# double check real and imaginary parts:
assert_equal(x.real, y.real.byteswap())
assert_equal(x.imag, y.imag.byteswap())
@@ -1783,8 +1783,8 @@ class TestRegression(object):
assert_equal(a1, a2)
def test_fields_strides(self):
- "Ticket #1760"
- r = np.fromstring('abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2')
+ "gh-2355"
+ r = np.frombuffer(b'abcdefghijklmnop'*4*3, dtype='i4,(2,3)u2')
assert_equal(r[0:3:2]['f1'], r['f1'][0:3:2])
assert_equal(r[0:3:2]['f1'][0], r[0:3:2][0]['f1'])
assert_equal(r[0:3:2]['f1'][0][()], r[0:3:2][0]['f1'][()])