author      Sebastian Berg <sebastianb@nvidia.com>    2022-12-19 19:40:20 +0100
committer   GitHub <noreply@github.com>               2022-12-19 10:40:20 -0800
commit      5e0ed03b6b7a6ff05843d846eedac730f20decb7 (patch)
tree        b96c2745c2354964ca854e5f5169d69dee0112bc
parent      ca257565537483ff3b37ab40013f2b7f344e968d (diff)
download    numpy-5e0ed03b6b7a6ff05843d846eedac730f20decb7.tar.gz
BUG: Ensure correct behavior for rows ending in delimiter in loadtxt (#22836)
If a row ends in a delimiter, `add_field` can be called twice without
any field actually being parsed. This causes issues with the field
buffer setup.
closes gh-22833
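
For context, a minimal Python-level sketch of the input shape this fix concerns (illustrative only, not part of the commit; the exact failure reported in gh-22833 may manifest differently): a row whose last character is the delimiter ends in an empty field, which is the situation where `add_field` runs back-to-back without a field being parsed in between.

```python
import numpy as np

# Illustrative sketch: a row ending in the delimiter yields a trailing
# empty field. With dtype=bytes the empty field is kept as b"".
res = np.loadtxt(["1,2,"], delimiter=",", dtype=bytes)
print(res)  # expected: [b'1' b'2' b'']

# Mirrors the test added in this commit: a line made only of delimiters
# has one more (empty) field than it has delimiters.
res = np.loadtxt(["," * 5], delimiter=",", dtype=bytes)
assert len(res) == 6
```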
-rw-r--r--   numpy/core/src/multiarray/textreading/tokenize.cpp |  4
-rw-r--r--   numpy/core/src/multiarray/textreading/tokenize.h   |  5
-rw-r--r--   numpy/lib/tests/test_loadtxt.py                     | 12
3 files changed, 18 insertions, 3 deletions
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 02c64263e..ff7e7a8c1 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
         const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
-    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+    /* Space for length +1 termination, +2 additional padding for add_field */
+    npy_intp size = chunk_length + ts->field_buffer_pos + 3;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
     ts->num_fields += 1;
     /* Ensure this (currently empty) word is NUL terminated. */
     ts->field_buffer[ts->field_buffer_pos] = '\0';
+    assert(ts->field_buffer_length > ts->field_buffer_pos);
     return 0;
 }
 
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index d0ea46383..53e97760f 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
     char *pos;
     char *end;
     /*
-     * Space to copy words into. The buffer must always be at least two NUL
-     * entries longer (8 bytes) than the actual word (including initially).
+     * Space to copy words into. Due to `add_field` not growing the buffer
+     * but writing a \0 termination, the buffer must always be two larger
+     * (add_field can be called twice if a row ends in a delimiter: "123,").
      * The first byte beyond the current word is always NUL'ed on write, the
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 0b8fe3c47..819a8dda4 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -1011,3 +1011,15 @@ def test_control_characters_as_bytes():
     """Byte control characters (comments, delimiter) are supported."""
     a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
     assert_equal(a, [1, 2, 3])
+
+
+@pytest.mark.filterwarnings('ignore::UserWarning')
+def test_field_growing_cases():
+    # Test empty field appending/growing (each field still takes 1 character)
+    # to see if the final field appending does not create issues.
+    res = np.loadtxt([""], delimiter=",", dtype=bytes)
+    assert len(res) == 0
+
+    for i in range(1, 1024):
+        res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
+        assert len(res) == i+1