TST: Some tests for control character collisions.

Adds some tests for the behavior of control characters (e.g. comments, delimiter, and quotechar) when they have the same value. At this stage, these tests are meant more to frame the discussion about what the behavior should be than to test what it currently is. I personally think raising an exception is correct for most of these situations, though it's worth noting that np.loadtxt currently doesn't raise for most of these corner cases (and seems to arbitrarily give precedence to delimiter over comments, or vice versa, depending on the values).
author: Ross Barnowski <rossbar@berkeley.edu> 2022-01-28 09:17:25 -0800
committer: GitHub <noreply@github.com> 2022-01-28 11:17:25 -0600
commit: b335431699f86ab523dc6dba2c91efc799f4372b (patch)
tree: 17b544cf5a609746bd3b98c24f6db204efb9a39a
parent: 763a3d4878671d383ba8aa573af90fee125efff4 (diff)
download: numpy-b335431699f86ab523dc6dba2c91efc799f4372b.tar.gz
3 files changed, 59 insertions, 8 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 93b00d18f..7af5ee891 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -165,8 +165,8 @@ error_if_matching_control_characters(
   error:
     if (control_char2 != NULL) {
         PyErr_Format(PyExc_TypeError,
-                "control characters '%s' and '%s' are identical, please set one"
-                "of them to `None` to indicate that it should not be used.",
+                "The values for control characters '%s' and '%s' are "
+                "incompatible",
                 control_char1, control_char2);
     }
     else {
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 62d00cfb9..be313d104 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -920,12 +920,6 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
 
     if comment is None:
         comments = None
-    elif isinstance(comment, str):
-        if len(comment) > 1:  # length of 0 is rejected later
-            comments = (comment,)
-            comment = None
-        else:
-            comments = None
     else:
         # assume comments are a sequence of strings
         comments = tuple(comment)
@@ -938,6 +932,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
             if isinstance(comments[0], str) and len(comments[0]) == 1:
                 comment = comments[0]
                 comments = None
+        else:
+            # Input validation if there are multiple comment characters
+            if delimiter in comments:
+                raise TypeError(
+                    f"Comment characters '{comments}' cannot include the "
+                    f"delimiter '{delimiter}'"
+                )
 
     # comment is now either a 1 or 0 character string or a tuple:
     if comments is not None:
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index c5a14ed46..2038bfc85 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -876,3 +876,53 @@ class TestCReaderUnitTests:
             data, dtype=np.dtype("U10"), filelike=True,
             quote='"', comment="#", skiplines=1)
         assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])
+
+
+def test_delimiter_comment_collision_raises():  # delimiter == comments
+    with pytest.raises(TypeError, match="control characters.*incompatible"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
+
+
+def test_delimiter_quotechar_collision_raises():  # delimiter == quotechar
+    with pytest.raises(TypeError, match="control characters.*incompatible"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",")
+
+
+def test_comment_quotechar_collision_raises():  # comments == quotechar
+    with pytest.raises(TypeError, match="control characters.*incompatible"):
+        np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#")
+
+
+def test_delimiter_and_multiple_comments_collision_raises():
+    """A multi-entry ``comments`` list must not contain the delimiter."""
+    expected = "Comment characters.*cannot include the delimiter"
+    with pytest.raises(TypeError, match=expected):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","])
+
+
+@pytest.mark.parametrize("ws", (
+    " ",  # space
+    "\t",  # tab
+    "\u2003",  # em space
+    "\u00A0",  # no-break space
+    "\u3000",  # ideographic space
+))
+def test_collision_with_default_delimiter_raises(ws):
+    """Whitespace comments/quotechar must raise when delimiter is default."""
+    data = f"1{ws}2{ws}3\n4{ws}5{ws}6\n"
+    msg = "control characters.*incompatible"
+    with pytest.raises(TypeError, match=msg):
+        np.loadtxt(StringIO(data), comments=ws)
+    with pytest.raises(TypeError, match=msg):
+        np.loadtxt(StringIO(data), quotechar=ws)
+
+
+@pytest.mark.parametrize("nl", ("\n", "\r"))
+def test_control_character_newline_raises(nl):
+    """Expect a TypeError when any control character is set to a newline."""
+    txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}")
+    msg = "control character.*cannot be a newline"
+    # Same order as before: delimiter, then comments, then quotechar.
+    for option in ("delimiter", "comments", "quotechar"):
+        with pytest.raises(TypeError, match=msg):
+            np.loadtxt(txt, **{option: nl})
author	Ross Barnowski <rossbar@berkeley.edu>	2022-01-28 09:17:25 -0800
committer	GitHub <noreply@github.com>	2022-01-28 11:17:25 -0600
commit	b335431699f86ab523dc6dba2c91efc799f4372b (patch)
tree	17b544cf5a609746bd3b98c24f6db204efb9a39a
parent	763a3d4878671d383ba8aa573af90fee125efff4 (diff)
download	numpy-b335431699f86ab523dc6dba2c91efc799f4372b.tar.gz