summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sebastian Berg <sebastian@sipsolutions.net> 2022-01-11 15:46:21 -0600
committer Sebastian Berg <sebastian@sipsolutions.net> 2022-01-14 20:07:07 -0600
commit e1f7ad16518f95b6c5b560a03375b4329c8136ff (patch)
tree b3b7319c474a99997f82a7a7eb175f7b906ff106
parent 3ca9f5a2a252e020a44a355f4fc8114d91ea3423 (diff)
download numpy-e1f7ad16518f95b6c5b560a03375b4329c8136ff.tar.gz
BUG: Fix complex parser and add tests for whitespace and failure paths
-rw-r--r-- numpy/core/src/multiarray/textreading/conversions.c 52
-rw-r--r-- numpy/lib/tests/test_io.py 27
2 files changed, 60 insertions, 19 deletions
diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
index 04060baa1..8c685ea64 100644
--- a/numpy/core/src/multiarray/textreading/conversions.c
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -46,11 +46,13 @@ to_bool(PyArray_Descr *NPY_UNUSED(descr),
static NPY_INLINE int
double_from_ucs4(
const Py_UCS4 *str, const Py_UCS4 *end,
- bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end)
+ bool strip_whitespace, double *result, const Py_UCS4 **p_end)
{
/* skip leading whitespace */
- while (Py_UNICODE_ISSPACE(*str)) {
- str++;
+ if (strip_whitespace) {
+ while (Py_UNICODE_ISSPACE(*str)) {
+ str++;
+ }
}
if (str == end) {
return -1; /* empty or only whitespace: not a floating point number */
@@ -69,7 +71,9 @@ double_from_ucs4(
char *c = ascii;
for (; str < end; str++, c++) {
if (NPY_UNLIKELY(*str >= 128)) {
- break; /* the following cannot be a number anymore */
+ /* Character cannot be used, ignore for end calculation and stop */
+ end = str;
+ break;
}
*c = (char)(*str);
}
@@ -86,7 +90,7 @@ double_from_ucs4(
return -1;
}
- if (skip_trailing_whitespace) {
+ if (strip_whitespace) {
/* and then skip any remainig whitespace: */
while (Py_UNICODE_ISSPACE(*end)) {
end++;
@@ -158,6 +162,10 @@ to_complex_int(
if (allow_parens && (*item == '(')) {
unmatched_opening_paren = true;
++item;
+ /* Allow whitespace within the parentheses: "( 1j)" */
+ while (Py_UNICODE_ISSPACE(*item)) {
+ ++item;
+ }
}
if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) {
return false;
@@ -168,23 +176,15 @@ to_complex_int(
return !unmatched_opening_paren;
}
if (*p_end == imaginary_unit) {
- // Pure imaginary part only (e.g "1.5j")
+ /* Only an imaginary part (e.g "1.5j") */
*p_imag = *p_real;
*p_real = 0.0;
++p_end;
- if (unmatched_opening_paren && (*p_end == ')')) {
- ++p_end;
- unmatched_opening_paren = false;
- }
- }
- else if (unmatched_opening_paren && (*p_end == ')')) {
- *p_imag = 0.0;
- ++p_end;
- unmatched_opening_paren = false;
}
- else {
+ else if (*p_end == '+' || *p_end == '-') {
+ /* Imaginary part still to parse */
if (*p_end == '+') {
- ++p_end;
+ ++p_end; /* Advance to support +- (and ++) */
}
if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) {
return false;
@@ -193,11 +193,25 @@ to_complex_int(
return false;
}
++p_end;
- if (unmatched_opening_paren && (*p_end == ')')) {
+ }
+ else {
+ *p_imag = 0;
+ }
+
+ if (unmatched_opening_paren) {
+ /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */
+ while (Py_UNICODE_ISSPACE(*p_end)) {
+ ++p_end;
+ }
+ if (*p_end == ')') {
++p_end;
- unmatched_opening_paren = false;
+ }
+ else {
+ /* parentheses was not closed */
+ return false;
}
}
+
while (Py_UNICODE_ISSPACE(*p_end)) {
++p_end;
}
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index b4ca5b74b..5ba852e3d 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -3263,3 +3263,30 @@ def test_loadtxt_warn_on_skipped_data(skiprows):
txt = TextIO(data)
with pytest.warns(UserWarning, match="input contained no data"):
np.loadtxt(txt, skiprows=skiprows)
+
+
+@pytest.mark.parametrize("dtype",
+ np.typecodes["AllInteger"] + "efdFD" + "?")
+def test_loadtxt_unicode_whitespace_stripping(dtype):
+ # Test that all numeric types (and bool) strip whitespace correctly
+ # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted.
+ # Currently, skip float128 as it did not always support this and has no
+ # "custom" parsing:
+ txt = StringIO(' 3 ,"\u202F2\n"')
+ res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
+ assert_array_equal(res, np.array([3, 2]).astype(dtype))
+
+@pytest.mark.parametrize("dtype", "FD")
+def test_loadtxt_unicode_whitespace_stripping_complex(dtype):
+ # Complex has a few extra cases since it has two components and parentheses
+ line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n"
+ data = [line, line.replace(" ", "\u202F")]
+ res = np.loadtxt(data, dtype=dtype, delimiter=',')
+ assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2))
+
+@pytest.mark.parametrize("dtype", "FD")
+@pytest.mark.parametrize("field",
+ ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"])
+def test_loadtxt_bad_complex(dtype, field):
+ with pytest.raises(ValueError):
+ np.loadtxt([field + "\n"], dtype=dtype, delimiter=",")