BUG: (loadtxt) Ignore last empty field when `delimiter=None`

When the delimiter is None and we are splitting on whitespace, a final empty field is always ignored to match the behaviour of Python's `str.split()`: for example, `" 1 ".split()` returns `['1']`, with no trailing empty string. Closes gh-21052
author: Sebastian Berg <sebastian@sipsolutions.net> 2022-02-13 17:15:44 -0600
committer: Sebastian Berg <sebastian@sipsolutions.net> 2022-02-14 07:57:01 -0600
commit: 67bb936e8d8bc29c1dc4e54449a173280d43b9dd (patch)
tree: 4aa2caed8b90214743eba2e5e4b170677b5bbba8
parent: ee1722d287fb32a724ac3f5807731e36cbbc5567 (diff)
download: numpy-67bb936e8d8bc29c1dc4e54449a173280d43b9dd.tar.gz
2 files changed, 19 insertions, 6 deletions
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 57bee1cdd..7cd3889cc 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -378,13 +378,22 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
     ts->num_fields -= 1;
 
     /*
-     * If have one field, but that field is completely empty, this is an
-     * empty line, and we just ignore it.
+     * We always start a new field (at the very beginning and whenever a
+     * delimiter was found).
+     * This gives us two scenarios where we need to ignore the last field
+     * if it is empty:
+     * 1. If there is exactly one empty (unquoted) field, the whole line is
+     *    empty.
+     * 2. If we are splitting on whitespace we always ignore a last empty
+     *    field to match Python's splitting: `" 1 ".split()`.
      */
     if (ts->num_fields == 1
-             && ts->fields[1].offset - ts->fields[0].offset == 1
-             && !ts->fields->quoted) {
-        ts->num_fields--;
+            || ts->unquoted_state == TOKENIZE_UNQUOTED_WHITESPACE) {
+        ssize_t offset_last = ts->fields[ts->num_fields-1].offset;
+        ssize_t end_last = ts->fields[ts->num_fields].offset;
+        if (!ts->fields->quoted && end_last - offset_last == 1) {
+            ts->num_fields--;
+        }
     }
     ts->state = TOKENIZE_INIT;
     return finished_reading_file;
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index cca328b16..f50e3b8ad 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -165,6 +165,7 @@ def test_bad_ndmin(badval):
 @pytest.mark.parametrize(
     "ws",
     (
+            " ",  # space
             "\t",  # tab
             "\u2003",  # em
             "\u00A0",  # non-break
@@ -173,7 +174,10 @@ def test_bad_ndmin(badval):
 )
 def test_blank_lines_spaces_delimit(ws):
     txt = StringIO(
-        f"1 2{ws}30\n\n4 5 60\n  {ws}  \n7 8 {ws} 90\n  # comment\n3 2 1"
+        f"1 2{ws}30\n\n{ws}\n"
+        f"4 5 60{ws}\n  {ws}  \n"
+        f"7 8 {ws} 90\n  # comment\n"
+        f"3 2 1"
     )
     # NOTE: It is unclear that the `  # comment` should succeed. Except
     #       for delimiter=None, which should use any whitespace (and maybe
author	Sebastian Berg <sebastian@sipsolutions.net>	2022-02-13 17:15:44 -0600
committer	Sebastian Berg <sebastian@sipsolutions.net>	2022-02-14 07:57:01 -0600
commit	67bb936e8d8bc29c1dc4e54449a173280d43b9dd (patch)
tree	4aa2caed8b90214743eba2e5e4b170677b5bbba8
parent	ee1722d287fb32a724ac3f5807731e36cbbc5567 (diff)
download	numpy-67bb936e8d8bc29c1dc4e54449a173280d43b9dd.tar.gz