BUG: Fix complex parser and add tests for whitespace and failure paths

author: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-11 15:46:21 -0600
committer: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-14 20:07:07 -0600
commit: e1f7ad16518f95b6c5b560a03375b4329c8136ff (patch)
tree: b3b7319c474a99997f82a7a7eb175f7b906ff106
parent: 3ca9f5a2a252e020a44a355f4fc8114d91ea3423 (diff)
download: numpy-e1f7ad16518f95b6c5b560a03375b4329c8136ff.tar.gz
2 files changed, 60 insertions, 19 deletions
diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
index 04060baa1..8c685ea64 100644
--- a/numpy/core/src/multiarray/textreading/conversions.c
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -46,11 +46,13 @@ to_bool(PyArray_Descr *NPY_UNUSED(descr),
 static NPY_INLINE int
 double_from_ucs4(
         const Py_UCS4 *str, const Py_UCS4 *end,
-        bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end)
+        bool strip_whitespace, double *result, const Py_UCS4 **p_end)
 {
     /* skip leading whitespace */
-    while (Py_UNICODE_ISSPACE(*str)) {
-        str++;
+    if (strip_whitespace) {
+        while (Py_UNICODE_ISSPACE(*str)) {
+            str++;
+        }
     }
     if (str == end) {
         return -1;  /* empty or only whitespace: not a floating point number */
@@ -69,7 +71,9 @@ double_from_ucs4(
     char *c = ascii;
     for (; str < end; str++, c++) {
         if (NPY_UNLIKELY(*str >= 128)) {
-            break;  /* the following cannot be a number anymore */
+            /* Character cannot be used, ignore for end calculation and stop */
+            end = str;
+            break;
         }
         *c = (char)(*str);
     }
@@ -86,7 +90,7 @@ double_from_ucs4(
         return -1;
     }
 
-    if (skip_trailing_whitespace) {
+    if (strip_whitespace) {
         /* and then skip any remainig whitespace: */
         while (Py_UNICODE_ISSPACE(*end)) {
             end++;
@@ -158,6 +162,10 @@ to_complex_int(
     if (allow_parens && (*item == '(')) {
         unmatched_opening_paren = true;
         ++item;
+        /* Allow whitespace within the parentheses: "( 1j)" */
+        while (Py_UNICODE_ISSPACE(*item)) {
+            ++item;
+        }
     }
     if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) {
         return false;
@@ -168,23 +176,15 @@ to_complex_int(
         return !unmatched_opening_paren;
     }
     if (*p_end == imaginary_unit) {
-        // Pure imaginary part only (e.g "1.5j")
+        /* Only an imaginary part (e.g "1.5j") */
         *p_imag = *p_real;
         *p_real = 0.0;
         ++p_end;
-        if (unmatched_opening_paren && (*p_end == ')')) {
-            ++p_end;
-            unmatched_opening_paren = false;
-        }
-    }
-    else if (unmatched_opening_paren && (*p_end == ')')) {
-        *p_imag = 0.0;
-        ++p_end;
-        unmatched_opening_paren = false;
     }
-    else {
+    else if (*p_end == '+' || *p_end == '-') {
+        /* Imaginary part still to parse */
         if (*p_end == '+') {
-            ++p_end;
+            ++p_end;  /* Advance to support +- (and ++) */
         }
         if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) {
             return false;
@@ -193,11 +193,25 @@ to_complex_int(
             return false;
         }
         ++p_end;
-        if (unmatched_opening_paren && (*p_end == ')')) {
+    }
+    else {
+        *p_imag = 0;
+    }
+
+    if (unmatched_opening_paren) {
+        /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */
+        while (Py_UNICODE_ISSPACE(*p_end)) {
+            ++p_end;
+        }
+        if (*p_end == ')') {
             ++p_end;
-            unmatched_opening_paren = false;
+        }
+        else {
+            /* parenthesis was not closed */
+            return false;
         }
     }
+
     while (Py_UNICODE_ISSPACE(*p_end)) {
         ++p_end;
     }
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index b4ca5b74b..5ba852e3d 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -3263,3 +3263,30 @@ def test_loadtxt_warn_on_skipped_data(skiprows):
     txt = TextIO(data)
     with pytest.warns(UserWarning, match="input contained no data"):
         np.loadtxt(txt, skiprows=skiprows)
+
+
+@pytest.mark.parametrize("dtype",
+        np.typecodes["AllInteger"] + "efdFD" + "?")
+def test_loadtxt_unicode_whitespace_stripping(dtype):
+    # Test that all numeric types (and bool) strip whitespace correctly
+    # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted.
+    # Currently, skip float128 as it did not always support this and has no
+    # "custom" parsing:
+    txt = StringIO(' 3 ,"\u202F2\n"')
+    res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
+    assert_array_equal(res, np.array([3, 2]).astype(dtype))
+
+@pytest.mark.parametrize("dtype", "FD")
+def test_loadtxt_unicode_whitespace_stripping_complex(dtype):
+    # Complex has a few extra cases since it has two components and parentheses
+    line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j )  , 8j , ( 9j ) \n"
+    data = [line, line.replace(" ", "\u202F")]
+    res = np.loadtxt(data, dtype=dtype, delimiter=',')
+    assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2))
+
+@pytest.mark.parametrize("dtype", "FD")
+@pytest.mark.parametrize("field",
+        ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"])
+def test_loadtxt_bad_complex(dtype, field):
+    with pytest.raises(ValueError):
+        np.loadtxt([field + "\n"], dtype=dtype, delimiter=",")
author	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-11 15:46:21 -0600
committer	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-14 20:07:07 -0600
commit	e1f7ad16518f95b6c5b560a03375b4329c8136ff (patch)
tree	b3b7319c474a99997f82a7a7eb175f7b906ff106
parent	3ca9f5a2a252e020a44a355f4fc8114d91ea3423 (diff)
download	numpy-e1f7ad16518f95b6c5b560a03375b4329c8136ff.tar.gz