diff options
| author | Ross Barnowski <rossbar@berkeley.edu> | 2022-01-28 09:17:25 -0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-01-28 11:17:25 -0600 |
| commit | b335431699f86ab523dc6dba2c91efc799f4372b (patch) | |
| tree | 17b544cf5a609746bd3b98c24f6db204efb9a39a | |
| parent | 763a3d4878671d383ba8aa573af90fee125efff4 (diff) | |
| download | numpy-b335431699f86ab523dc6dba2c91efc799f4372b.tar.gz | |
TST: Some tests for control character collisions.
Adds some tests for the behavior of control characters (e.g. `comments`, `delimiter`, and `quotechar`) when they have the same value. At this stage, these tests are meant to frame the discussion about what the behavior should be, rather than to test what it currently is. I personally think raising an exception is correct for most of these situations, though it's worth noting that `np.loadtxt` currently doesn't raise for most of these corner cases (and seems to arbitrarily assign precedence to the delimiter over comments, or vice versa, depending on the values).
| -rw-r--r-- | numpy/core/src/multiarray/textreading/readtext.c | 4 | ||||
| -rw-r--r-- | numpy/lib/npyio.py | 13 | ||||
| -rw-r--r-- | numpy/lib/tests/test_loadtxt.py | 50 |
3 files changed, 59 insertions, 8 deletions
```diff
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 93b00d18f..7af5ee891 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -165,8 +165,8 @@ error_if_matching_control_characters(
 error:
     if (control_char2 != NULL) {
         PyErr_Format(PyExc_TypeError,
-                "control characters '%s' and '%s' are identical, please set one"
-                "of them to `None` to indicate that it should not be used.",
+                "The values for control characters '%s' and '%s' are "
+                "incompatible",
                 control_char1, control_char2);
     }
     else {
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 62d00cfb9..be313d104 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -920,12 +920,6 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
 
     if comment is None:
         comments = None
-    elif isinstance(comment, str):
-        if len(comment) > 1:  # length of 0 is rejected later
-            comments = (comment,)
-            comment = None
-        else:
-            comments = None
     else:
         # assume comments are a sequence of strings
         comments = tuple(comment)
@@ -938,6 +932,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
             if isinstance(comments[0], str) and len(comments[0]) == 1:
                 comment = comments[0]
                 comments = None
+        else:
+            # Input validation if there are multiple comment characters
+            if delimiter in comments:
+                raise TypeError(
+                    f"Comment characters '{comments}' cannot include the "
+                    f"delimiter '{delimiter}'"
+                )
 
     # comment is now either a 1 or 0 character string or a tuple:
     if comments is not None:
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index c5a14ed46..2038bfc85 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -876,3 +876,53 @@ class TestCReaderUnitTests:
             data, dtype=np.dtype("U10"), filelike=True,
             quote='"', comment="#", skiplines=1)
         assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])
+
+
+def test_delimiter_comment_collision_raises():
+    with pytest.raises(TypeError, match="control characters.*are identical"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
+
+
+def test_delimiter_quotechar_collision_raises():
+    with pytest.raises(TypeError, match="control characters.*are identical"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",")
+
+
+def test_comment_quotechar_collision_raises():
+    with pytest.raises(TypeError, match="control characters.*are identical"):
+        np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#")
+
+
+def test_delimiter_and_multiple_comments_collision_raises():
+    with pytest.raises(
+        TypeError, match="Comment characters.*cannot include the delimiter"
+    ):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","])
+
+
+@pytest.mark.parametrize(
+    "ws",
+    (
+        " ",  # space
+        "\t",  # tab
+        "\u2003",  # em
+        "\u00A0",  # non-break
+        "\u3000",  # ideographic space
+    )
+)
+def test_collision_with_default_delimiter_raises(ws):
+    with pytest.raises(TypeError, match="control characters.*are identical"):
+        np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws)
+    with pytest.raises(TypeError, match="control characters.*are identical"):
+        np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws)
+
+
+@pytest.mark.parametrize("nl", ("\n", "\r"))
+def test_control_character_newline_raises(nl):
+    txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}")
+    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+        np.loadtxt(txt, delimiter=nl)
+    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+        np.loadtxt(txt, comments=nl)
+    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+        np.loadtxt(txt, quotechar=nl)
```
