summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-01-19 11:50:13 -0600
committerSebastian Berg <sebastian@sipsolutions.net>2022-01-19 11:52:43 -0600
commit9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1 (patch)
treee099ebf638aeadb64362d73b3953db08dc37fa3a /numpy
parente15d85324a1f5641aaccafbe3cc87556e88ff0a3 (diff)
downloadnumpy-9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1.tar.gz
ENH: Give a clear error when control characters match/are newlines
These never have a meaning (at best there is an implicit order, meaning one of them is ignored), except theoretically in the `delimiter=None` case (or in strange cases with `\r` being a delimiter, but `\n` being the newline when a file is opened manually but not in universal newline mode). It seems more useful to just generally raise an error, since all of these "features" are weird corner cases and likely surprising to users.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/multiarray/textreading/readtext.c84
1 files changed, 84 insertions, 0 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 5d3613736..93b00d18f 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -100,6 +100,84 @@ parse_control_character(PyObject *obj, Py_UCS4 *character)
}
+/*
+ * A (somewhat verbose) check that none of the control characters match or are
+ * newline.  Most of these combinations are completely fine, just weird or
+ * surprising.
+ * (I.e. there is an implicit priority for control characters, so if a comment
+ * matches a delimiter, it would just be a comment.)
+ * In theory some `delimiter=None` paths could have a "meaning", but let us
+ * assume that users are better off setting one of the control chars to `None`
+ * for clarity.
+ *
+ * This also checks that the control characters cannot be newlines.
+ *
+ * A value of `(Py_UCS4)-1` marks a control character as unset and exempts it
+ * from the newline and equality checks.  For the delimiter, unset means
+ * "split on whitespace", so a whitespace comment or quote character is
+ * rejected in that case as well.
+ *
+ * Returns 0 if the combination is acceptable.  Otherwise sets a `TypeError`
+ * naming the offending control character(s) and returns -1.
+ */
+static int
+error_if_matching_control_characters(
+        Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+    /*
+     * Names used in the error message.  `control_char2` stays NULL when the
+     * problem is a newline rather than a clash between two characters.
+     */
+    const char *control_char1 = NULL;
+    const char *control_char2 = NULL;
+
+    if (comment != (Py_UCS4)-1) {
+        control_char1 = "comment";
+        if (comment == '\r' || comment == '\n') {
+            goto error;
+        }
+        else if (comment == quote) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+        else if (comment == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (quote != (Py_UCS4)-1) {
+        control_char1 = "quotechar";
+        if (quote == '\r' || quote == '\n') {
+            goto error;
+        }
+        else if (quote == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (delimiter != (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (delimiter == '\r' || delimiter == '\n') {
+            goto error;
+        }
+    }
+    /* The above doesn't work with delimiter=None, which means "whitespace" */
+    if (delimiter == (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (Py_UNICODE_ISSPACE(comment)) {
+            control_char2 = "comment";
+            goto error;
+        }
+        else if (Py_UNICODE_ISSPACE(quote)) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+    }
+    return 0;
+
+  error:
+    if (control_char2 != NULL) {
+        /* Note the trailing space: adjacent literals are concatenated. */
+        PyErr_Format(PyExc_TypeError,
+                "control characters '%s' and '%s' are identical, please set one "
+                "of them to `None` to indicate that it should not be used.",
+                control_char1, control_char2);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+                control_char1);
+    }
+    return -1;
+}
+
+
NPY_NO_EXPORT PyObject *
_load_from_filelike(PyObject *NPY_UNUSED(mod),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
@@ -148,6 +226,12 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
return NULL;
}
+ /* Reject matching control characters, they just rarely make sense anyway */
+ if (error_if_matching_control_characters(
+ pc.delimiter, pc.quote, pc.comment) < 0) {
+ return NULL;
+ }
+
if (pc.delimiter == (Py_UCS4)-1) {
pc.delimiter_is_whitespace = true;
/* Ignore leading whitespace to match `string.split(None)` */