diff options
author | dmbelov <dmbelov@gmail.com> | 2023-01-01 07:38:57 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-01-01 13:38:57 +0100 |
commit | 6d474f2dfe5e4b7ea2a6e7423a638316fa186227 (patch) | |
tree | f7b19ec9919f95433e6ee41ebb596d0df28ecd4c | |
parent | 77f15776e1d6d1357cd8c45dd297da821e18e0c2 (diff) | |
download | numpy-6d474f2dfe5e4b7ea2a6e7423a638316fa186227.tar.gz |
BUG: np.loadtxt cannot load text file with quoted fields separated by whitespace (#22906)
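Fix `np.loadtxt` with `delimiter=None` and a quote character: after a quoted field the tokenizer always fell back to the plain unquoted state instead of returning to whitespace-delimiter mode, so quoted fields separated by whitespace were not parsed properly.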
Closes gh-22899
-rw-r--r-- | numpy/core/src/multiarray/textreading/tokenize.cpp | 3 | ||||
-rw-r--r-- | numpy/lib/npyio.py | 8 | ||||
-rw-r--r-- | numpy/lib/tests/test_loadtxt.py | 14 |
3 files changed, 24 insertions, 1 deletion
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp index ff7e7a8c1..cc5e621d6 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.cpp +++ b/numpy/core/src/multiarray/textreading/tokenize.cpp @@ -223,7 +223,8 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config) } else { /* continue parsing as if unquoted */ - ts->state = TOKENIZE_UNQUOTED; + /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */ + ts->state = ts->unquoted_state; } break; diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 4a27c7898..71d600c30 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1303,6 +1303,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, array([('alpha, #42', 10.), ('beta, #64', 2.)], dtype=[('label', '<U12'), ('value', '<f8')]) + Quoted fields can be separated by multiple whitespace characters: + + >>> s = StringIO('"alpha, #42" 10.0\n"beta, #64" 2.0\n') + >>> dtype = np.dtype([("label", "U12"), ("value", float)]) + >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"') + array([('alpha, #42', 10.), ('beta, #64', 2.)], + dtype=[('label', '<U12'), ('value', '<f8')]) + Two consecutive quote characters within a quoted field are treated as a single escaped character: diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 819a8dda4..8a5b044b8 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -534,6 +534,20 @@ def test_quoted_field(q): assert_array_equal(res, expected) +@pytest.mark.parametrize("q", ('"', "'", "`")) +def test_quoted_field_with_whitepace_delimiter(q): + txt = StringIO( + f"{q}alpha, x{q} 2.5\n{q}beta, y{q} 4.5\n{q}gamma, z{q} 5.0\n" + ) + dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) + expected = np.array( + [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype + ) + + res = np.loadtxt(txt, dtype=dtype, delimiter=None, quotechar=q) + 
assert_array_equal(res, expected) + + def test_quote_support_default(): """Support for quoted fields is disabled by default.""" txt = StringIO('"lat,long", 45, 30\n') |