summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matti Picus <matti.picus@gmail.com> 2022-02-14 21:56:04 +0200
committer GitHub <noreply@github.com> 2022-02-14 21:56:04 +0200
commit 5eab6434dd3646f1df78d38454f316e84723dc4f (patch)
tree e3d3046d1f701b0859dd424c51b18d9a6d97e7c0
parent 91d49731cb5611135dc8810a2990e0acca34fdaa (diff)
parent 67bb936e8d8bc29c1dc4e54449a173280d43b9dd (diff)
download numpy-5eab6434dd3646f1df78d38454f316e84723dc4f.tar.gz
Merge pull request #21054 from seberg/fixup-no-delimiter
BUG: (loadtxt) Ignore last empty field when `delimiter=None`
-rw-r--r-- numpy/core/src/multiarray/textreading/tokenize.cpp 19
-rw-r--r-- numpy/lib/tests/test_loadtxt.py 6
2 files changed, 19 insertions, 6 deletions
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 57bee1cdd..7cd3889cc 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -378,13 +378,22 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
ts->num_fields -= 1;
/*
- * If have one field, but that field is completely empty, this is an
- * empty line, and we just ignore it.
+ * We always start a new field (at the very beginning and whenever a
+ * delimiter was found).
+ * This gives us two scenarios where we need to ignore the last field
+ * if it is empty:
+ * 1. If there is exactly one empty (unquoted) field, the whole line is
+ * empty.
+ * 2. If we are splitting on whitespace we always ignore a last empty
+ * field to match Python's splitting: `" 1 ".split()`.
*/
if (ts->num_fields == 1
- && ts->fields[1].offset - ts->fields[0].offset == 1
- && !ts->fields->quoted) {
- ts->num_fields--;
+ || ts->unquoted_state == TOKENIZE_UNQUOTED_WHITESPACE) {
+ ssize_t offset_last = ts->fields[ts->num_fields-1].offset;
+ ssize_t end_last = ts->fields[ts->num_fields].offset;
+ if (!ts->fields->quoted && end_last - offset_last == 1) {
+ ts->num_fields--;
+ }
}
ts->state = TOKENIZE_INIT;
return finished_reading_file;
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index cca328b16..f50e3b8ad 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -165,6 +165,7 @@ def test_bad_ndmin(badval):
@pytest.mark.parametrize(
"ws",
(
+ " ", # space
"\t", # tab
"\u2003", # em
"\u00A0", # non-break
@@ -173,7 +174,10 @@ def test_bad_ndmin(badval):
)
def test_blank_lines_spaces_delimit(ws):
txt = StringIO(
- f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1"
+ f"1 2{ws}30\n\n{ws}\n"
+ f"4 5 60{ws}\n {ws} \n"
+ f"7 8 {ws} 90\n # comment\n"
+ f"3 2 1"
)
# NOTE: It is unclear that the ` # comment` should succeed. Except
# for delimiter=None, which should use any whitespace (and maybe