#define PY_SSIZE_T_CLEAN #include #include #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_argparse.h" #include "common.h" #include "conversion_utils.h" #include "textreading/parser_config.h" #include "textreading/stream_pyobject.h" #include "textreading/field_types.h" #include "textreading/rows.h" #include "textreading/str_to_int.h" // // `usecols` must point to a Python object that is Py_None or a 1-d contiguous // numpy array with data type int32. // // `dtype` must point to a Python object that is Py_None or a numpy dtype // instance. If the latter, code and sizes must be arrays of length // num_dtype_fields, holding the flattened data field type codes and byte // sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, // but we do that in Python code.) // // If both `usecols` and `dtype` are not None, and the data type is compound, // then len(usecols) must equal num_dtype_fields. // // If `dtype` is given and it is compound, and `usecols` is None, then the // number of columns in the file must match the number of fields in `dtype`. // static PyObject * _readtext_from_stream(stream *s, parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], Py_ssize_t skiplines, Py_ssize_t max_rows, PyObject *converters, PyObject *dtype) { PyArrayObject *arr = NULL; PyArray_Descr *out_dtype = NULL; field_type *ft = NULL; /* * If dtypes[0] is dtype the input was not structured and the result * is considered "homogeneous" and we have to discover the number of * columns/ */ out_dtype = (PyArray_Descr *)dtype; Py_INCREF(out_dtype); Py_ssize_t num_fields = field_types_create(out_dtype, &ft); if (num_fields < 0) { goto finish; } bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; if (!homogeneous && usecols != NULL && num_usecols != num_fields) { PyErr_Format(PyExc_TypeError, "If a structured dtype is used, the number of columns in " "`usecols` must match the effective number of fields. " "But %zd usecols were given and the number of fields is %zd.", num_usecols, num_fields); goto finish; } arr = read_rows( s, max_rows, num_fields, ft, pc, num_usecols, usecols, skiplines, converters, NULL, out_dtype, homogeneous); if (arr == NULL) { goto finish; } finish: Py_XDECREF(out_dtype); field_types_xclear(num_fields, ft); return (PyObject *)arr; } static int parse_control_character(PyObject *obj, Py_UCS4 *character) { if (obj == Py_None) { *character = (Py_UCS4)-1; /* character beyond unicode range */ return 1; } if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) != 1) { PyErr_Format(PyExc_TypeError, "Text reading control character must be a single unicode " "character or None; but got: %.100R", obj); return 0; } *character = PyUnicode_READ_CHAR(obj, 0); return 1; } /* * A (somewhat verbose) check that none of the control characters match or are * newline. Most of these combinations are completely fine, just weird or * surprising. * (I.e. there is an implicit priority for control characters, so if a comment * matches a delimiter, it would just be a comment.) * In theory some `delimiter=None` paths could have a "meaning", but let us * assume that users are better of setting one of the control chars to `None` * for clarity. * * This also checks that the control characters cannot be newlines. */ static int error_if_matching_control_characters( Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment) { char *control_char1; char *control_char2 = NULL; if (comment != (Py_UCS4)-1) { control_char1 = "comment"; if (comment == '\r' || comment == '\n') { goto error; } else if (comment == quote) { control_char2 = "quotechar"; goto error; } else if (comment == delimiter) { control_char2 = "delimiter"; goto error; } } if (quote != (Py_UCS4)-1) { control_char1 = "quotechar"; if (quote == '\r' || quote == '\n') { goto error; } else if (quote == delimiter) { control_char2 = "delimiter"; goto error; } } if (delimiter != (Py_UCS4)-1) { control_char1 = "delimiter"; if (delimiter == '\r' || delimiter == '\n') { goto error; } } /* The above doesn't work with delimiter=None, which means "whitespace" */ if (delimiter == (Py_UCS4)-1) { control_char1 = "delimiter"; if (Py_UNICODE_ISSPACE(comment)) { control_char2 = "comment"; goto error; } else if (Py_UNICODE_ISSPACE(quote)) { control_char2 = "quotechar"; goto error; } } return 0; error: if (control_char2 != NULL) { PyErr_Format(PyExc_TypeError, "The values for control characters '%s' and '%s' are " "incompatible", control_char1, control_char2); } else { PyErr_Format(PyExc_TypeError, "control character '%s' cannot be a newline (`\\r` or `\\n`).", control_char1, control_char2); } return -1; } NPY_NO_EXPORT PyObject * _load_from_filelike(PyObject *NPY_UNUSED(mod), PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *file; Py_ssize_t skiplines = 0; Py_ssize_t max_rows = -1; PyObject *usecols_obj = Py_None; PyObject *converters = Py_None; PyObject *dtype = Py_None; PyObject *encoding_obj = Py_None; const char *encoding = NULL; parser_config pc = { .delimiter = ',', .quote = '"', .comment = '#', .ignore_leading_whitespace = false, .delimiter_is_whitespace = false, .imaginary_unit = 'j', .python_byte_converters = false, .c_byte_converters = false, .gave_int_via_float_warning = false, }; bool filelike = true; PyObject *arr = NULL; NPY_PREPARE_ARGPARSER; if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, "file", NULL, &file, "|delimiter", &parse_control_character, &pc.delimiter, "|comment", &parse_control_character, &pc.comment, "|quote", &parse_control_character, &pc.quote, "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, "|usecols", NULL, &usecols_obj, "|skiplines", &PyArray_IntpFromPyIntConverter, &skiplines, "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, "|converters", NULL, &converters, "|dtype", NULL, &dtype, "|encoding", NULL, &encoding_obj, "|filelike", &PyArray_BoolConverter, &filelike, "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters, "|c_byte_converters", PyArray_BoolConverter, &pc.c_byte_converters, NULL, NULL, NULL) < 0) { return NULL; } /* Reject matching control characters, they just rarely make sense anyway */ if (error_if_matching_control_characters( pc.delimiter, pc.quote, pc.comment) < 0) { return NULL; } if (pc.delimiter == (Py_UCS4)-1) { pc.delimiter_is_whitespace = true; /* Ignore leading whitespace to match `string.split(None)` */ pc.ignore_leading_whitespace = true; } if (!PyArray_DescrCheck(dtype) ) { PyErr_SetString(PyExc_TypeError, "internal error: dtype must be provided and be a NumPy dtype"); return NULL; } if (encoding_obj != Py_None) { if (!PyUnicode_Check(encoding_obj)) { PyErr_SetString(PyExc_TypeError, "encoding must be a unicode string."); return NULL; } encoding = PyUnicode_AsUTF8(encoding_obj); if (encoding == NULL) { return NULL; } } /* * Parse usecols, the rest of NumPy has no clear helper for this, so do * it here manually. */ Py_ssize_t num_usecols = -1; Py_ssize_t *usecols = NULL; if (usecols_obj != Py_None) { num_usecols = PySequence_Length(usecols_obj); if (num_usecols < 0) { return NULL; } /* Calloc just to not worry about overflow */ usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); if (usecols == NULL) { PyErr_NoMemory(); return NULL; } for (Py_ssize_t i = 0; i < num_usecols; i++) { PyObject *tmp = PySequence_GetItem(usecols_obj, i); if (tmp == NULL) { PyMem_FREE(usecols); return NULL; } usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); if (error_converting(usecols[i])) { if (PyErr_ExceptionMatches(PyExc_TypeError)) { PyErr_Format(PyExc_TypeError, "usecols must be an int or a sequence of ints but " "it contains at least one element of type '%s'", Py_TYPE(tmp)->tp_name); } Py_DECREF(tmp); PyMem_FREE(usecols); return NULL; } Py_DECREF(tmp); } } stream *s; if (filelike) { s = stream_python_file(file, encoding); } else { s = stream_python_iterable(file, encoding); } if (s == NULL) { PyMem_FREE(usecols); return NULL; } arr = _readtext_from_stream( s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype); stream_close(s); PyMem_FREE(usecols); return arr; }