summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2019-09-13 09:25:21 +0300
committerGitHub <noreply@github.com>2019-09-13 09:25:21 +0300
commit79ee678fb2f1518b3f45e26066a02b45b22d456c (patch)
treee8b8f529c7cfcf9451e8b12655291a5d7279e2e5
parente97975a2027e81d93ccdc96ab767653773b3cc04 (diff)
parentbf6809a63238dd50b2a42a7d62824f1e438a222a (diff)
downloadnumpy-79ee678fb2f1518b3f45e26066a02b45b22d456c.tar.gz
Merge pull request #13605 from seberg/fromfile-eof-deprecation
DEP: Deprecate silent ignoring of bad data in fromfile/fromstring
-rw-r--r--changelog/13605.deprecation.rst9
-rw-r--r--numpy/core/src/multiarray/_multiarray_tests.c.src22
-rw-r--r--numpy/core/src/multiarray/ctors.c78
-rw-r--r--numpy/core/tests/test_deprecations.py60
-rw-r--r--numpy/core/tests/test_longdouble.py38
-rw-r--r--numpy/core/tests/test_multiarray.py3
-rw-r--r--numpy/core/tests/test_regression.py3
7 files changed, 183 insertions, 30 deletions
diff --git a/changelog/13605.deprecation.rst b/changelog/13605.deprecation.rst
new file mode 100644
index 000000000..bff12e965
--- /dev/null
+++ b/changelog/13605.deprecation.rst
@@ -0,0 +1,9 @@
+`np.fromfile` and `np.fromstring` will error on bad data
+--------------------------------------------------------
+
+In future numpy releases, the functions `np.fromfile` and `np.fromstring`
+will throw an error when parsing bad data.
+This will now give a ``DeprecationWarning`` where previously partial or
+even invalid data was silently returned. This deprecation also affects
+the C defined functions c:func:`PyArray_FromString`` and
+c:func:`PyArray_FromFile`
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index 1365e87bb..b0985c80f 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -593,6 +593,25 @@ fail:
return NULL;
}
+/*
+ * Helper to test fromstring of 0 terminated strings, as the C-API supports
+ * the -1 length identifier.
+ */
+static PyObject *
+fromstring_null_term_c_api(PyObject *dummy, PyObject *byte_obj)
+{
+ char *string;
+
+ string = PyBytes_AsString(byte_obj);
+ if (string == NULL) {
+ return NULL;
+ }
+
+ return PyArray_FromString(
+ string, -1, PyArray_DescrFromType(NPY_FLOAT64), -1, " ");
+}
+
+
/* check no elison for avoided increfs */
static PyObject *
incref_elide(PyObject *dummy, PyObject *args)
@@ -1927,6 +1946,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"test_inplace_increment",
inplace_increment,
METH_VARARGS, NULL},
+ {"fromstring_null_term_c_api",
+ fromstring_null_term_c_api,
+ METH_O, NULL},
{"incref_elide",
incref_elide,
METH_VARARGS, NULL},
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 50d58ac16..188e68001 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -40,9 +40,31 @@
* regards to the handling of text representations.
*/
+/*
+ * Scanning function for next element parsing and seperator skipping.
+ * These functions return:
+ * - 0 to indicate more data to read
+ * - -1 when reading stopped at the end of the string/file
+ * - -2 when reading stopped before the end was reached.
+ *
+ * The dtype specific parsing functions may set the python error state
+ * (they have to get the GIL first) additionally.
+ */
typedef int (*next_element)(void **, void *, PyArray_Descr *, void *);
typedef int (*skip_separator)(void **, const char *, void *);
+
+static npy_bool
+string_is_fully_read(char const* start, char const* end) {
+ if (end == NULL) {
+ return *start == '\0'; /* null terminated */
+ }
+ else {
+ return start >= end; /* fixed length */
+ }
+}
+
+
static int
fromstr_next_element(char **s, void *dptr, PyArray_Descr *dtype,
const char *end)
@@ -50,19 +72,23 @@ fromstr_next_element(char **s, void *dptr, PyArray_Descr *dtype,
char *e = *s;
int r = dtype->f->fromstr(*s, dptr, &e, dtype);
/*
- * fromstr always returns 0 for basic dtypes
- * s points to the end of the parsed string
- * if an error occurs s is not changed
+ * fromstr always returns 0 for basic dtypes; s points to the end of the
+ * parsed string. If s is not changed an error occurred or the end was
+ * reached.
*/
- if (*s == e) {
- /* Nothing read */
- return -1;
+ if (*s == e || r < 0) {
+ /* Nothing read, could be end of string or an error (or both) */
+ if (string_is_fully_read(*s, end)) {
+ return -1;
+ }
+ return -2;
}
*s = e;
if (end != NULL && *s > end) {
+ /* Stop the iteration if we read far enough */
return -1;
}
- return r;
+ return 0;
}
static int
@@ -75,9 +101,13 @@ fromfile_next_element(FILE **fp, void *dptr, PyArray_Descr *dtype,
if (r == 1) {
return 0;
}
- else {
+ else if (r == EOF) {
return -1;
}
+ else {
+ /* unable to read more, but EOF not reached indicating an error. */
+ return -2;
+ }
}
/*
@@ -143,9 +173,10 @@ fromstr_skip_separator(char **s, const char *sep, const char *end)
{
char *string = *s;
int result = 0;
+
while (1) {
char c = *string;
- if (c == '\0' || (end != NULL && string >= end)) {
+ if (string_is_fully_read(string, end)) {
result = -1;
break;
}
@@ -3596,6 +3627,7 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
npy_intp i;
char *dptr, *clean_sep, *tmp;
int err = 0;
+ int stop_reading_flag; /* -1 indicates end reached; -2 a parsing error */
npy_intp thisbuf = 0;
npy_intp size;
npy_intp bytes, totalbytes;
@@ -3623,9 +3655,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
NPY_BEGIN_ALLOW_THREADS;
totalbytes = bytes = size * dtype->elsize;
dptr = PyArray_DATA(r);
- for (i= 0; num < 0 || i < num; i++) {
- if (next(&stream, dptr, dtype, stream_data) < 0) {
- /* EOF */
+ for (i = 0; num < 0 || i < num; i++) {
+ stop_reading_flag = next(&stream, dptr, dtype, stream_data);
+ if (stop_reading_flag < 0) {
break;
}
*nread += 1;
@@ -3642,7 +3674,12 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
dptr = tmp + (totalbytes - bytes);
thisbuf = 0;
}
- if (skip_sep(&stream, clean_sep, stream_data) < 0) {
+ stop_reading_flag = skip_sep(&stream, clean_sep, stream_data);
+ if (stop_reading_flag < 0) {
+ if (num == i + 1) {
+ /* if we read as much as requested sep is optional */
+ stop_reading_flag = -1;
+ }
break;
}
}
@@ -3661,6 +3698,21 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
}
}
NPY_END_ALLOW_THREADS;
+
+ if (stop_reading_flag == -2) {
+ if (PyErr_Occurred()) {
+ /* If an error is already set (unlikely), do not create new one */
+ Py_DECREF(r);
+ return NULL;
+ }
+ /* 2019-09-12, NumPy 1.18 */
+ if (DEPRECATE(
+ "string or file could not be read to its end due to unmatched "
+ "data; this will raise a ValueError in the future.") < 0) {
+ goto fail;
+ }
+ }
+
free(clean_sep);
fail:
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index e8aa0c70b..46cebdd31 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -10,12 +10,16 @@ import sys
import operator
import warnings
import pytest
+import shutil
+import tempfile
import numpy as np
from numpy.testing import (
- assert_raises, assert_warns, assert_
+ assert_raises, assert_warns, assert_, assert_array_equal
)
+from numpy.core._multiarray_tests import fromstring_null_term_c_api
+
try:
import pytz
_has_pytz = True
@@ -514,11 +518,65 @@ class TestPositiveOnNonNumerical(_DeprecationTestCase):
def test_positive_on_non_number(self):
self.assert_deprecated(operator.pos, args=(np.array('foo'),))
+
class TestFromstring(_DeprecationTestCase):
# 2017-10-19, 1.14
def test_fromstring(self):
self.assert_deprecated(np.fromstring, args=('\x00'*80,))
+
+class TestFromStringAndFileInvalidData(_DeprecationTestCase):
+ # 2019-06-08, 1.17.0
+ # Tests should be moved to real tests when deprecation is done.
+ message = "string or file could not be read to its end"
+
+ @pytest.mark.parametrize("invalid_str", [",invalid_data", "invalid_sep"])
+ def test_deprecate_unparsable_data_file(self, invalid_str):
+ x = np.array([1.51, 2, 3.51, 4], dtype=float)
+
+ with tempfile.TemporaryFile(mode="w") as f:
+ x.tofile(f, sep=',', format='%.2f')
+ f.write(invalid_str)
+
+ f.seek(0)
+ self.assert_deprecated(lambda: np.fromfile(f, sep=","))
+ f.seek(0)
+ self.assert_deprecated(lambda: np.fromfile(f, sep=",", count=5))
+ # Should not raise:
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+ f.seek(0)
+ res = np.fromfile(f, sep=",", count=4)
+ assert_array_equal(res, x)
+
+ @pytest.mark.parametrize("invalid_str", [",invalid_data", "invalid_sep"])
+ def test_deprecate_unparsable_string(self, invalid_str):
+ x = np.array([1.51, 2, 3.51, 4], dtype=float)
+ x_str = "1.51,2,3.51,4{}".format(invalid_str)
+
+ self.assert_deprecated(lambda: np.fromstring(x_str, sep=","))
+ self.assert_deprecated(lambda: np.fromstring(x_str, sep=",", count=5))
+
+ # The C-level API can use not fixed size, but 0 terminated strings,
+ # so test that as well:
+ bytestr = x_str.encode("ascii")
+ self.assert_deprecated(lambda: fromstring_null_term_c_api(bytestr))
+
+ with assert_warns(DeprecationWarning):
+ # this is slightly strange, in that fromstring leaves data
+ # potentially uninitialized (would be good to error when all is
+ # read, but count is larger then actual data maybe).
+ res = np.fromstring(x_str, sep=",", count=5)
+ assert_array_equal(res[:-1], x)
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", DeprecationWarning)
+
+ # Should not raise:
+ res = np.fromstring(x_str, sep=",", count=4)
+ assert_array_equal(res, x)
+
+
class Test_GetSet_NumericOps(_DeprecationTestCase):
# 2018-09-20, 1.16.0
def test_get_numeric_ops(self):
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index ee4197f8f..59ac5923c 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -5,7 +5,8 @@ import pytest
import numpy as np
from numpy.testing import (
- assert_, assert_equal, assert_raises, assert_array_equal, temppath,
+ assert_, assert_equal, assert_raises, assert_warns, assert_array_equal,
+ temppath,
)
from numpy.core.tests._locales import CommaDecimalPointLocale
@@ -71,18 +72,21 @@ def test_fromstring():
def test_fromstring_bogus():
- assert_equal(np.fromstring("1. 2. 3. flop 4.", dtype=float, sep=" "),
- np.array([1., 2., 3.]))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.fromstring("1. 2. 3. flop 4.", dtype=float, sep=" "),
+ np.array([1., 2., 3.]))
def test_fromstring_empty():
- assert_equal(np.fromstring("xxxxx", sep="x"),
- np.array([]))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.fromstring("xxxxx", sep="x"),
+ np.array([]))
def test_fromstring_missing():
- assert_equal(np.fromstring("1xx3x4x5x6", sep="x"),
- np.array([1]))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.fromstring("1xx3x4x5x6", sep="x"),
+ np.array([1]))
class TestFileBased(object):
@@ -95,7 +99,9 @@ class TestFileBased(object):
with temppath() as path:
with open(path, 'wt') as f:
f.write("1. 2. 3. flop 4.\n")
- res = np.fromfile(path, dtype=float, sep=" ")
+
+ with assert_warns(DeprecationWarning):
+ res = np.fromfile(path, dtype=float, sep=" ")
assert_equal(res, np.array([1., 2., 3.]))
@pytest.mark.skipif(string_to_longdouble_inaccurate,
@@ -186,12 +192,14 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
assert_equal(a[0], f)
def test_fromstring_best_effort_float(self):
- assert_equal(np.fromstring("1,234", dtype=float, sep=" "),
- np.array([1.]))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.fromstring("1,234", dtype=float, sep=" "),
+ np.array([1.]))
def test_fromstring_best_effort(self):
- assert_equal(np.fromstring("1,234", dtype=np.longdouble, sep=" "),
- np.array([1.]))
+ with assert_warns(DeprecationWarning):
+ assert_equal(np.fromstring("1,234", dtype=np.longdouble, sep=" "),
+ np.array([1.]))
def test_fromstring_foreign(self):
s = "1.234"
@@ -204,8 +212,10 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
assert_array_equal(a, b)
def test_fromstring_foreign_value(self):
- b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
- assert_array_equal(b[0], 1)
+ with assert_warns(DeprecationWarning):
+ b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
+ assert_array_equal(b[0], 1)
+
@pytest.mark.parametrize("int_val", [
# cases discussed in gh-10723
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 05f01ad82..58572f268 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4963,7 +4963,8 @@ class TestIO(object):
self._check_from(b'1,2,3,4', [1., 2., 3., 4.], dtype=float, sep=',')
def test_malformed(self):
- self._check_from(b'1.234 1,234', [1.234, 1.], sep=' ')
+ with assert_warns(DeprecationWarning):
+ self._check_from(b'1.234 1,234', [1.234, 1.], sep=' ')
def test_long_sep(self):
self._check_from(b'1_x_3_x_4_x_5', [1, 3, 4, 5], sep='_x_')
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index cbd4ceafc..9dc231deb 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -1539,7 +1539,8 @@ class TestRegression(object):
def test_fromstring_crash(self):
# Ticket #1345: the following should not cause a crash
- np.fromstring(b'aa, aa, 1.0', sep=',')
+ with assert_warns(DeprecationWarning):
+ np.fromstring(b'aa, aa, 1.0', sep=',')
def test_ticket_1539(self):
dtypes = [x for x in np.typeDict.values()