Merge pull request #13605 from seberg/fromfile-eof-deprecation

DEP: Deprecate silent ignoring of bad data in fromfile/fromstring
author: Matti Picus <matti.picus@gmail.com> 2019-09-13 09:25:21 +0300
committer: GitHub <noreply@github.com> 2019-09-13 09:25:21 +0300
commit: 79ee678fb2f1518b3f45e26066a02b45b22d456c (patch)
tree: e8b8f529c7cfcf9451e8b12655291a5d7279e2e5
parent: e97975a2027e81d93ccdc96ab767653773b3cc04 (diff)
parent: bf6809a63238dd50b2a42a7d62824f1e438a222a (diff)
download: numpy-79ee678fb2f1518b3f45e26066a02b45b22d456c.tar.gz
7 files changed, 183 insertions, 30 deletions
diff --git a/changelog/13605.deprecation.rst b/changelog/13605.deprecation.rst
new file mode 100644
index 000000000..bff12e965
--- /dev/null
+++ b/changelog/13605.deprecation.rst
@@ -0,0 +1,9 @@
+`np.fromfile` and `np.fromstring` will error on bad data
+--------------------------------------------------------
+
+In future numpy releases, the functions `np.fromfile` and `np.fromstring`
+will throw an error when parsing bad data.
+This will now give a ``DeprecationWarning`` where previously partial or
+even invalid data was silently returned. This deprecation also affects
+the C defined functions c:func:`PyArray_FromString`` and
+c:func:`PyArray_FromFile`
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index 1365e87bb..b0985c80f 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -593,6 +593,25 @@ fail:
     return NULL;
 }
 
+/*
+ * Helper to test fromstring of 0 terminated strings, as the C-API supports
+ * the -1 length identifier.
+ */
+static PyObject *
+fromstring_null_term_c_api(PyObject *dummy, PyObject *byte_obj)
+{
+    char *string;
+
+    string = PyBytes_AsString(byte_obj);
+    if (string == NULL) {
+        return NULL;
+    }
+
+    return PyArray_FromString(
+                    string, -1, PyArray_DescrFromType(NPY_FLOAT64), -1, " ");
+}
+
+
 /* check no elison for avoided increfs */
 static PyObject *
 incref_elide(PyObject *dummy, PyObject *args)
@@ -1927,6 +1946,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
     {"test_inplace_increment",
         inplace_increment,
         METH_VARARGS, NULL},
+    {"fromstring_null_term_c_api",
+        fromstring_null_term_c_api,
+        METH_O, NULL},
     {"incref_elide",
         incref_elide,
         METH_VARARGS, NULL},
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 50d58ac16..188e68001 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -40,9 +40,31 @@
  * regards to the handling of text representations.
  */
 
+/*
+ * Scanning function for next element parsing and seperator skipping.
+ * These functions return:
+ *   - 0 to indicate more data to read
+ *   - -1 when reading stopped at the end of the string/file
+ *   - -2 when reading stopped before the end was reached.
+ *
+ * The dtype specific parsing functions may set the python error state
+ * (they have to get the GIL first) additionally.
+ */
 typedef int (*next_element)(void **, void *, PyArray_Descr *, void *);
 typedef int (*skip_separator)(void **, const char *, void *);
 
+
+static npy_bool
+string_is_fully_read(char const* start, char const* end) {
+    if (end == NULL) {
+        return *start == '\0';  /* null terminated */
+    }
+    else {
+        return start >= end;  /* fixed length */
+    }
+}
+
+
 static int
 fromstr_next_element(char **s, void *dptr, PyArray_Descr *dtype,
                      const char *end)
@@ -50,19 +72,23 @@ fromstr_next_element(char **s, void *dptr, PyArray_Descr *dtype,
     char *e = *s;
     int r = dtype->f->fromstr(*s, dptr, &e, dtype);
     /*
-     * fromstr always returns 0 for basic dtypes
-     * s points to the end of the parsed string
-     * if an error occurs s is not changed
+     * fromstr always returns 0 for basic dtypes; s points to the end of the
+     * parsed string. If s is not changed an error occurred or the end was
+     * reached.
      */
-    if (*s == e) {
-        /* Nothing read */
-        return -1;
+    if (*s == e || r < 0) {
+        /* Nothing read, could be end of string or an error (or both) */
+        if (string_is_fully_read(*s, end)) {
+            return -1;
+        }
+        return -2;
     }
     *s = e;
     if (end != NULL && *s > end) {
+        /* Stop the iteration if we read far enough */
         return -1;
     }
-    return r;
+    return 0;
 }
 
 static int
@@ -75,9 +101,13 @@ fromfile_next_element(FILE **fp, void *dptr, PyArray_Descr *dtype,
     if (r == 1) {
         return 0;
     }
-    else {
+    else if (r == EOF) {
         return -1;
     }
+    else {
+        /* unable to read more, but EOF not reached indicating an error. */
+        return -2;
+    }
 }
 
 /*
@@ -143,9 +173,10 @@ fromstr_skip_separator(char **s, const char *sep, const char *end)
 {
     char *string = *s;
     int result = 0;
+
     while (1) {
         char c = *string;
-        if (c == '\0' || (end != NULL && string >= end)) {
+        if (string_is_fully_read(string, end)) {
             result = -1;
             break;
         }
@@ -3596,6 +3627,7 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
     npy_intp i;
     char *dptr, *clean_sep, *tmp;
     int err = 0;
+    int stop_reading_flag;  /* -1 indicates end reached; -2 a parsing error */
     npy_intp thisbuf = 0;
     npy_intp size;
     npy_intp bytes, totalbytes;
@@ -3623,9 +3655,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
     NPY_BEGIN_ALLOW_THREADS;
     totalbytes = bytes = size * dtype->elsize;
     dptr = PyArray_DATA(r);
-    for (i= 0; num < 0 || i < num; i++) {
-        if (next(&stream, dptr, dtype, stream_data) < 0) {
-            /* EOF */
+    for (i = 0; num < 0 || i < num; i++) {
+        stop_reading_flag = next(&stream, dptr, dtype, stream_data);
+        if (stop_reading_flag < 0) {
             break;
         }
         *nread += 1;
@@ -3642,7 +3674,12 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
             dptr = tmp + (totalbytes - bytes);
             thisbuf = 0;
         }
-        if (skip_sep(&stream, clean_sep, stream_data) < 0) {
+        stop_reading_flag = skip_sep(&stream, clean_sep, stream_data);
+        if (stop_reading_flag < 0) {
+            if (num == i + 1) {
+                /* if we read as much as requested sep is optional */
+                stop_reading_flag = -1;
+            }
             break;
         }
     }
@@ -3661,6 +3698,21 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
         }
     }
     NPY_END_ALLOW_THREADS;
+
+    if (stop_reading_flag == -2) {
+        if (PyErr_Occurred()) {
+            /* If an error is already set (unlikely), do not create new one */
+            Py_DECREF(r);
+            return NULL;
+        }
+        /* 2019-09-12, NumPy 1.18 */
+        if (DEPRECATE(
+                "string or file could not be read to its end due to unmatched "
+                "data; this will raise a ValueError in the future.") < 0) {
+            goto fail;
+        }
+    }
+
     free(clean_sep);
 
 fail:
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index e8aa0c70b..46cebdd31 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -10,12 +10,16 @@ import sys
 import operator
 import warnings
 import pytest
+import shutil
+import tempfile
 
 import numpy as np
 from numpy.testing import (
-    assert_raises, assert_warns, assert_
+    assert_raises, assert_warns, assert_, assert_array_equal
     )
 
+from numpy.core._multiarray_tests import fromstring_null_term_c_api
+
 try:
     import pytz
     _has_pytz = True
@@ -514,11 +518,65 @@ class TestPositiveOnNonNumerical(_DeprecationTestCase):
     def test_positive_on_non_number(self):
         self.assert_deprecated(operator.pos, args=(np.array('foo'),))
 
+
 class TestFromstring(_DeprecationTestCase):
     # 2017-10-19, 1.14
     def test_fromstring(self):
         self.assert_deprecated(np.fromstring, args=('\x00'*80,))
 
+
+class TestFromStringAndFileInvalidData(_DeprecationTestCase):
+    # 2019-06-08, 1.17.0
+    # Tests should be moved to real tests when deprecation is done.
+    message = "string or file could not be read to its end"
+
+    @pytest.mark.parametrize("invalid_str", [",invalid_data", "invalid_sep"])
+    def test_deprecate_unparsable_data_file(self, invalid_str):
+        x = np.array([1.51, 2, 3.51, 4], dtype=float)
+
+        with tempfile.TemporaryFile(mode="w") as f:
+            x.tofile(f, sep=',', format='%.2f')
+            f.write(invalid_str)
+
+            f.seek(0)
+            self.assert_deprecated(lambda: np.fromfile(f, sep=","))
+            f.seek(0)
+            self.assert_deprecated(lambda: np.fromfile(f, sep=",", count=5))
+            # Should not raise:
+            with warnings.catch_warnings():
+                warnings.simplefilter("error", DeprecationWarning)
+                f.seek(0)
+                res = np.fromfile(f, sep=",", count=4)
+                assert_array_equal(res, x)
+
+    @pytest.mark.parametrize("invalid_str", [",invalid_data", "invalid_sep"])
+    def test_deprecate_unparsable_string(self, invalid_str):
+        x = np.array([1.51, 2, 3.51, 4], dtype=float)
+        x_str = "1.51,2,3.51,4{}".format(invalid_str)
+
+        self.assert_deprecated(lambda: np.fromstring(x_str, sep=","))
+        self.assert_deprecated(lambda: np.fromstring(x_str, sep=",", count=5))
+
+        # The C-level API can use not fixed size, but 0 terminated strings,
+        # so test that as well:
+        bytestr = x_str.encode("ascii")
+        self.assert_deprecated(lambda: fromstring_null_term_c_api(bytestr))
+
+        with assert_warns(DeprecationWarning):
+            # this is slightly strange, in that fromstring leaves data
+            # potentially uninitialized (would be good to error when all is
+            # read, but count is larger then actual data maybe).
+            res = np.fromstring(x_str, sep=",", count=5)
+            assert_array_equal(res[:-1], x)
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", DeprecationWarning)
+
+            # Should not raise:
+            res = np.fromstring(x_str, sep=",", count=4)
+            assert_array_equal(res, x)
+
+
 class Test_GetSet_NumericOps(_DeprecationTestCase):
     # 2018-09-20, 1.16.0
     def test_get_numeric_ops(self):
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index ee4197f8f..59ac5923c 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -5,7 +5,8 @@ import pytest
 
 import numpy as np
 from numpy.testing import (
-    assert_, assert_equal, assert_raises, assert_array_equal, temppath,
+    assert_, assert_equal, assert_raises, assert_warns, assert_array_equal,
+    temppath,
     )
 from numpy.core.tests._locales import CommaDecimalPointLocale
 
@@ -71,18 +72,21 @@ def test_fromstring():
 
 
 def test_fromstring_bogus():
-    assert_equal(np.fromstring("1. 2. 3. flop 4.", dtype=float, sep=" "),
-                 np.array([1., 2., 3.]))
+    with assert_warns(DeprecationWarning):
+        assert_equal(np.fromstring("1. 2. 3. flop 4.", dtype=float, sep=" "),
+                     np.array([1., 2., 3.]))
 
 
 def test_fromstring_empty():
-    assert_equal(np.fromstring("xxxxx", sep="x"),
-                 np.array([]))
+    with assert_warns(DeprecationWarning):
+        assert_equal(np.fromstring("xxxxx", sep="x"),
+                     np.array([]))
 
 
 def test_fromstring_missing():
-    assert_equal(np.fromstring("1xx3x4x5x6", sep="x"),
-                 np.array([1]))
+    with assert_warns(DeprecationWarning):
+        assert_equal(np.fromstring("1xx3x4x5x6", sep="x"),
+                     np.array([1]))
 
 
 class TestFileBased(object):
@@ -95,7 +99,9 @@ class TestFileBased(object):
         with temppath() as path:
             with open(path, 'wt') as f:
                 f.write("1. 2. 3. flop 4.\n")
-            res = np.fromfile(path, dtype=float, sep=" ")
+
+            with assert_warns(DeprecationWarning):
+                res = np.fromfile(path, dtype=float, sep=" ")
         assert_equal(res, np.array([1., 2., 3.]))
 
     @pytest.mark.skipif(string_to_longdouble_inaccurate,
@@ -186,12 +192,14 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
         assert_equal(a[0], f)
 
     def test_fromstring_best_effort_float(self):
-        assert_equal(np.fromstring("1,234", dtype=float, sep=" "),
-                     np.array([1.]))
+        with assert_warns(DeprecationWarning):
+            assert_equal(np.fromstring("1,234", dtype=float, sep=" "),
+                         np.array([1.]))
 
     def test_fromstring_best_effort(self):
-        assert_equal(np.fromstring("1,234", dtype=np.longdouble, sep=" "),
-                     np.array([1.]))
+        with assert_warns(DeprecationWarning):
+            assert_equal(np.fromstring("1,234", dtype=np.longdouble, sep=" "),
+                         np.array([1.]))
 
     def test_fromstring_foreign(self):
         s = "1.234"
@@ -204,8 +212,10 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
         assert_array_equal(a, b)
 
     def test_fromstring_foreign_value(self):
-        b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
-        assert_array_equal(b[0], 1)
+        with assert_warns(DeprecationWarning):
+            b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
+            assert_array_equal(b[0], 1)
+
 
 @pytest.mark.parametrize("int_val", [
     # cases discussed in gh-10723
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 05f01ad82..58572f268 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4963,7 +4963,8 @@ class TestIO(object):
         self._check_from(b'1,2,3,4', [1., 2., 3., 4.], dtype=float, sep=',')
 
     def test_malformed(self):
-        self._check_from(b'1.234 1,234', [1.234, 1.], sep=' ')
+        with assert_warns(DeprecationWarning):
+            self._check_from(b'1.234 1,234', [1.234, 1.], sep=' ')
 
     def test_long_sep(self):
         self._check_from(b'1_x_3_x_4_x_5', [1, 3, 4, 5], sep='_x_')
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index cbd4ceafc..9dc231deb 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -1539,7 +1539,8 @@ class TestRegression(object):
 
     def test_fromstring_crash(self):
         # Ticket #1345: the following should not cause a crash
-        np.fromstring(b'aa, aa, 1.0', sep=',')
+        with assert_warns(DeprecationWarning):
+            np.fromstring(b'aa, aa, 1.0', sep=',')
 
     def test_ticket_1539(self):
         dtypes = [x for x in np.typeDict.values()
author	Matti Picus <matti.picus@gmail.com>	2019-09-13 09:25:21 +0300
committer	GitHub <noreply@github.com>	2019-09-13 09:25:21 +0300
commit	79ee678fb2f1518b3f45e26066a02b45b22d456c (patch)
tree	e8b8f529c7cfcf9451e8b12655291a5d7279e2e5
parent	e97975a2027e81d93ccdc96ab767653773b3cc04 (diff)
parent	bf6809a63238dd50b2a42a7d62824f1e438a222a (diff)
download	numpy-79ee678fb2f1518b3f45e26066a02b45b22d456c.tar.gz