ENH: Give a clear error when control characters match/are newlines

These combinations never have a useful meaning (at best there is an implicit ordering, so that one of the characters is simply ignored), except theoretically in the `delimiter=None` case (or in strange cases where `\r` is the delimiter but `\n` is the newline, when a file is opened manually but not in universal newline mode). It seems more useful to just generally raise an error, since all of these "features" are weird corner cases and likely surprising to users.
author: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-19 11:50:13 -0600
committer: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-19 11:52:43 -0600
commit: 9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1 (patch)
tree: e099ebf638aeadb64362d73b3953db08dc37fa3a /numpy
parent: e15d85324a1f5641aaccafbe3cc87556e88ff0a3 (diff)
download: numpy-9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1.tar.gz
1 files changed, 84 insertions, 0 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 5d3613736..93b00d18f 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -100,6 +100,84 @@ parse_control_character(PyObject *obj, Py_UCS4 *character)
 }
 
 
+/*
+ * Check (somewhat verbosely) that no two control characters match and that
+ * none of them is a newline.  Most such combinations would parse fine, but
+ * are weird or surprising: there is an implicit priority between control
+ * characters, so e.g. a comment that matches the delimiter would simply be
+ * treated as a comment.  In theory some `delimiter=None` paths could have a
+ * "meaning", but let us assume that users are better off setting one of the
+ * control chars to `None` for clarity.
+ *
+ * A value of `(Py_UCS4)-1` is treated as "not set" (`None`) for each argument.
+ * Returns 0 if the combination is fine, or -1 with a TypeError set.
+ */
+static int
+error_if_matching_control_characters(
+        Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+    const char *control_char1;
+    const char *control_char2 = NULL;
+    if (comment != (Py_UCS4)-1) {
+        control_char1 = "comment";
+        if (comment == '\r' || comment == '\n') {
+            goto error;
+        }
+        else if (comment == quote) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+        else if (comment == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (quote != (Py_UCS4)-1) {
+        control_char1 = "quotechar";
+        if (quote == '\r' || quote == '\n') {
+            goto error;
+        }
+        else if (quote == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (delimiter != (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (delimiter == '\r' || delimiter == '\n') {
+            goto error;
+        }
+    }
+    /* The above doesn't work with delimiter=None, which means "whitespace" */
+    if (delimiter == (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (Py_UNICODE_ISSPACE(comment)) {
+            control_char2 = "comment";
+            goto error;
+        }
+        else if (Py_UNICODE_ISSPACE(quote)) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+    }
+    return 0;
+
+  error:
+    if (control_char2 != NULL) {
+        PyErr_Format(PyExc_TypeError,
+                "control characters '%s' and '%s' are identical, please set one"
+                " of them to `None` to indicate that it should not be used.",
+                control_char1, control_char2);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+                control_char1);
+    }
+    return -1;
+}
+
+
 NPY_NO_EXPORT PyObject *
 _load_from_filelike(PyObject *NPY_UNUSED(mod),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
@@ -148,6 +226,12 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         return NULL;
     }
 
+    /* Reject matching control characters, they just rarely make sense anyway */
+    if (error_if_matching_control_characters(
+            pc.delimiter, pc.quote, pc.comment) < 0) {
+        return NULL;
+    }
+
     if (pc.delimiter == (Py_UCS4)-1) {
         pc.delimiter_is_whitespace = true;
         /* Ignore leading whitespace to match `string.split(None)` */
author	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-19 11:50:13 -0600
committer	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-19 11:52:43 -0600
commit	9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1 (patch)
tree	e099ebf638aeadb64362d73b3953db08dc37fa3a /numpy
parent	e15d85324a1f5641aaccafbe3cc87556e88ff0a3 (diff)
download	numpy-9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1.tar.gz