summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/multiarray/textreading/readtext.c84
1 files changed, 84 insertions, 0 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 5d3613736..93b00d18f 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -100,6 +100,84 @@ parse_control_character(PyObject *obj, Py_UCS4 *character)
}
+/*
+ * A (somewhat verbose) check that none of the control characters match or are
+ * newline. Most of these combinations are completely fine, just weird or
+ * surprising.
+ * (I.e. there is an implicit priority for control characters, so if a comment
+ * matches a delimiter, it would just be a comment.)
+ * In theory some `delimiter=None` paths could have a "meaning", but let us
+ * assume that users are better of setting one of the control chars to `None`
+ * for clarity.
+ *
+ * This also checks that the control characters cannot be newlines.
+ */
+static int
+error_if_matching_control_characters(
+ Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+ char *control_char1;
+ char *control_char2 = NULL;
+ if (comment != (Py_UCS4)-1) {
+ control_char1 = "comment";
+ if (comment == '\r' || comment == '\n') {
+ goto error;
+ }
+ else if (comment == quote) {
+ control_char2 = "quotechar";
+ goto error;
+ }
+ else if (comment == delimiter) {
+ control_char2 = "delimiter";
+ goto error;
+ }
+ }
+ if (quote != (Py_UCS4)-1) {
+ control_char1 = "quotechar";
+ if (quote == '\r' || quote == '\n') {
+ goto error;
+ }
+ else if (quote == delimiter) {
+ control_char2 = "delimiter";
+ goto error;
+ }
+ }
+ if (delimiter != (Py_UCS4)-1) {
+ control_char1 = "delimiter";
+ if (delimiter == '\r' || delimiter == '\n') {
+ goto error;
+ }
+ }
+ /* The above doesn't work with delimiter=None, which means "whitespace" */
+ if (delimiter == (Py_UCS4)-1) {
+ control_char1 = "delimiter";
+ if (Py_UNICODE_ISSPACE(comment)) {
+ control_char2 = "comment";
+ goto error;
+ }
+ else if (Py_UNICODE_ISSPACE(quote)) {
+ control_char2 = "quotechar";
+ goto error;
+ }
+ }
+ return 0;
+
+ error:
+ if (control_char2 != NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "control characters '%s' and '%s' are identical, please set one"
+ "of them to `None` to indicate that it should not be used.",
+ control_char1, control_char2);
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+ control_char1, control_char2);
+ }
+ return -1;
+}
+
+
NPY_NO_EXPORT PyObject *
_load_from_filelike(PyObject *NPY_UNUSED(mod),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
@@ -148,6 +226,12 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
return NULL;
}
+ /* Reject matching control characters, they just rarely make sense anyway */
+ if (error_if_matching_control_characters(
+ pc.delimiter, pc.quote, pc.comment) < 0) {
+ return NULL;
+ }
+
if (pc.delimiter == (Py_UCS4)-1) {
pc.delimiter_is_whitespace = true;
/* Ignore leading whitespace to match `string.split(None)` */