diff options
| author | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-19 11:50:13 -0600 |
|---|---|---|
| committer | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-19 11:52:43 -0600 |
| commit | 9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1 (patch) | |
| tree | e099ebf638aeadb64362d73b3953db08dc37fa3a /numpy | |
| parent | e15d85324a1f5641aaccafbe3cc87556e88ff0a3 (diff) | |
| download | numpy-9f9d755c9ab68a9ef3a5ffee8ea30d6a837732c1.tar.gz | |
ENH: Give a clear error when control characters match/are newlines
These never have a meaning (at best an implicit order meaning one
is ignored), except theoretically in the `delimiter=None` case
(or in strange cases with `\r` being a delimiter, but `\n` being
the newline when a file is opened manually but not in universal
newline mode).
It seems more useful to just generally raise an error, since all of
these "features" are weird corner cases and likely surprising to users.
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/multiarray/textreading/readtext.c | 84 |
1 files changed, 84 insertions, 0 deletions
```diff
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 5d3613736..93b00d18f 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -100,6 +100,84 @@ parse_control_character(PyObject *obj, Py_UCS4 *character)
 }
 
 
+/*
+ * A (somewhat verbose) check that none of the control characters match or are
+ * newline. Most of these combinations are completely fine, just weird or
+ * surprising.
+ * (I.e. there is an implicit priority for control characters, so if a comment
+ * matches a delimiter, it would just be a comment.)
+ * In theory some `delimiter=None` paths could have a "meaning", but let us
+ * assume that users are better of setting one of the control chars to `None`
+ * for clarity.
+ *
+ * This also checks that the control characters cannot be newlines.
+ */
+static int
+error_if_matching_control_characters(
+        Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+    char *control_char1;
+    char *control_char2 = NULL;
+    if (comment != (Py_UCS4)-1) {
+        control_char1 = "comment";
+        if (comment == '\r' || comment == '\n') {
+            goto error;
+        }
+        else if (comment == quote) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+        else if (comment == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (quote != (Py_UCS4)-1) {
+        control_char1 = "quotechar";
+        if (quote == '\r' || quote == '\n') {
+            goto error;
+        }
+        else if (quote == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (delimiter != (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (delimiter == '\r' || delimiter == '\n') {
+            goto error;
+        }
+    }
+    /* The above doesn't work with delimiter=None, which means "whitespace" */
+    if (delimiter == (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (Py_UNICODE_ISSPACE(comment)) {
+            control_char2 = "comment";
+            goto error;
+        }
+        else if (Py_UNICODE_ISSPACE(quote)) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+    }
+    return 0;
+
+  error:
+    if (control_char2 != NULL) {
+        PyErr_Format(PyExc_TypeError,
+                "control characters '%s' and '%s' are identical, please set one"
+                "of them to `None` to indicate that it should not be used.",
+                control_char1, control_char2);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+                control_char1, control_char2);
+    }
+    return -1;
+}
+
+
 NPY_NO_EXPORT PyObject *
 _load_from_filelike(PyObject *NPY_UNUSED(mod),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
@@ -148,6 +226,12 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         return NULL;
     }
 
+    /* Reject matching control characters, they just rarely make sense anyway */
+    if (error_if_matching_control_characters(
+            pc.delimiter, pc.quote, pc.comment) < 0) {
+        return NULL;
+    }
+
     if (pc.delimiter == (Py_UCS4)-1) {
         pc.delimiter_is_whitespace = true;
         /* Ignore leading whitespace to match `string.split(None)` */
```
