diff options
author | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-12 22:36:29 -0600 |
---|---|---|
committer | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-14 20:07:07 -0600 |
commit | e2d9f6b8f34b45657773b42f1c1334e075b443b3 (patch) | |
tree | f9228d03d84828da3710100a2d5e448033a03eb7 | |
parent | 334470edb1e4e4ea1bc87773ef6d0c6fd510486a (diff) | |
download | numpy-e2d9f6b8f34b45657773b42f1c1334e075b443b3.tar.gz |
MAINT: Move usecol handling to C and support more than integer cols
Of course to actually use that many columns you need A LOT of memory
right now. Each field stores at least a UCS4 NUL character, but
the field is padded enough to require 16 bytes.
We always parse a full row, so that requires 20 bytes per field...
(i.e. 32 GiB RAM is not enough to test this :)).
-rw-r--r-- | numpy/core/src/multiarray/textreading/readtext.c | 80 | ||||
-rw-r--r-- | numpy/core/src/multiarray/textreading/rows.c | 13 | ||||
-rw-r--r-- | numpy/core/src/multiarray/textreading/rows.h | 4 | ||||
-rw-r--r-- | numpy/lib/npyio.py | 21 | ||||
-rw-r--r-- | numpy/lib/tests/test_io.py | 15 |
5 files changed, 74 insertions, 59 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index fcf5056e2..869be4b02 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -8,6 +8,7 @@ #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_argparse.h" +#include "common.h" #include "conversion_utils.h" #include "textreading/parser_config.h" @@ -34,14 +35,13 @@ // number of columns in the file must match the number of fields in `dtype`. // static PyObject * -_readtext_from_stream(stream *s, parser_config *pc, - PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows, - PyObject *converters, PyObject *dtype) +_readtext_from_stream(stream *s, + parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], + Py_ssize_t skiprows, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) { PyArrayObject *arr = NULL; PyArray_Descr *out_dtype = NULL; - int32_t *cols; - int ncols; field_type *ft = NULL; /* @@ -52,24 +52,24 @@ _readtext_from_stream(stream *s, parser_config *pc, out_dtype = (PyArray_Descr *)dtype; Py_INCREF(out_dtype); - npy_intp num_fields = field_types_create(out_dtype, &ft); + Py_ssize_t num_fields = field_types_create(out_dtype, &ft); if (num_fields < 0) { goto finish; } bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; - if (usecols == Py_None) { - ncols = num_fields; - cols = NULL; - } - else { - ncols = PyArray_SIZE((PyArrayObject *)usecols); - cols = PyArray_DATA((PyArrayObject *)usecols); + if (!homogeneous && usecols != NULL && num_usecols != num_fields) { + PyErr_Format(PyExc_TypeError, + "If a structured dtype is used, the number of columns in " + "`usecols` must match the effective number of fields. " + "But %zd usecols were given and the number of fields is %zd.", + num_usecols, num_fields); + goto finish; } arr = read_rows( s, max_rows, num_fields, ft, pc, - ncols, cols, skiprows, converters, + num_usecols, usecols, skiprows, converters, NULL, out_dtype, homogeneous); if (arr == NULL) { goto finish; @@ -107,7 +107,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), PyObject *file; Py_ssize_t skiprows = 0; Py_ssize_t max_rows = -1; - PyObject *usecols = Py_None; + PyObject *usecols_obj = Py_None; PyObject *converters = Py_None; PyObject *dtype = Py_None; @@ -136,7 +136,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), "|comment", &parse_control_character, &pc.comment, "|quote", &parse_control_character, &pc.quote, "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, - "|usecols", NULL, &usecols, + "|usecols", NULL, &usecols_obj, "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows, "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, "|converters", NULL, &converters, @@ -172,24 +172,38 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), return NULL; } } + /* - * TODO: It would be nicer to move usecol parsing to C, but we don't have - * quite the right helper in NumPy yet so using a 1D, 32bit, - * contiguous array. (and ensure this is true) - * NOTE: This should never fail as the public API ensures the conditions - * are met. + * Parse usecols, the rest of NumPy has no clear helper for this, so do + * it here manually. */ - if (usecols != Py_None) { - if (!PyArray_CheckExact(usecols) - || PyArray_NDIM((PyArrayObject *)usecols) != 1 - || !PyArray_ISCARRAY((PyArrayObject *)usecols) - || PyArray_DESCR((PyArrayObject *)usecols)->kind != 'i' - || PyArray_DESCR((PyArrayObject *)usecols)->elsize != 4 - || PyArray_ISBYTESWAPPED((PyArrayObject *)usecols)) { - PyErr_SetString(PyExc_RuntimeError, - "Internally a bad value was passed for usecols."); + Py_ssize_t num_usecols = -1; + Py_ssize_t *usecols = NULL; + if (usecols_obj != Py_None) { + num_usecols = PySequence_Length(usecols_obj); + if (num_usecols < 0) { return NULL; } + /* Calloc just to not worry about overflow */ + usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + for (Py_ssize_t i = 0; i < num_usecols; i++) { + PyObject *tmp = PySequence_GetItem(usecols_obj, i); + if (tmp == NULL) { + return NULL; + } + usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (error_converting(usecols[i])) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, + "usecols must be an int or a sequence of ints but " + "it contains at least one element of type '%s'", + tmp->ob_type->tp_name); + } + Py_DECREF(tmp); + return NULL; + } + Py_DECREF(tmp); + } } stream *s; @@ -201,12 +215,14 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), } if (s == NULL) { PyErr_Format(PyExc_RuntimeError, "Unable to access the file."); + PyMem_FREE(usecols); return NULL; } - arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows, - converters, dtype); + arr = _readtext_from_stream( + s, &pc, num_usecols, usecols, skiprows, max_rows, converters, dtype); stream_close(s); + PyMem_FREE(usecols); return arr; } diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 08bc80cc4..8c95ba537 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -31,7 +31,7 @@ */ static PyObject ** create_conv_funcs( - PyObject *converters, int num_fields, int32_t *usecols) + PyObject *converters, int num_fields, Py_ssize_t *usecols) { PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *)); if (conv_funcs == NULL) { @@ -155,8 +155,8 @@ create_conv_funcs( */ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, - npy_intp max_rows, int num_field_types, field_type *field_types, - parser_config *pconfig, int num_usecols, int *usecols, + npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous) @@ -187,6 +187,7 @@ read_rows(stream *s, int actual_num_fields = -1; if (usecols != NULL) { actual_num_fields = num_usecols; + assert(num_field_types == num_usecols); } else if (!homogeneous) { actual_num_fields = num_field_types; @@ -330,9 +331,9 @@ read_rows(stream *s, } } - for (int i = 0; i < actual_num_fields; ++i) { - int f; /* The field, either 0 (if homogeneous) or i. */ - int col; /* The column as read, remapped by usecols */ + for (Py_ssize_t i = 0; i < actual_num_fields; ++i) { + Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. */ + Py_ssize_t col; /* The column as read, remapped by usecols */ char *item_ptr; if (homogeneous) { f = 0; diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h index 342af0a4b..20eb9e186 100644 --- a/numpy/core/src/multiarray/textreading/rows.h +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -13,8 +13,8 @@ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, - npy_intp nrows, int num_field_types, field_type *field_types, - parser_config *pconfig, int num_usecols, int *usecols, + npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous); diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6e660dce1..f15f94580 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -909,25 +909,12 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', dtype = np.dtype(object) if usecols is not None: - # Allow usecols to be a single int or a sequence of ints + # Allow usecols to be a single int or a sequence of ints, the C-code + # handles the rest try: - usecols_as_list = list(usecols) + usecols = list(usecols) except TypeError: - usecols_as_list = [usecols] - for col_idx in usecols_as_list: - try: - operator.index(col_idx) - except TypeError: - # Some unit tests for numpy.loadtxt require that the - # error message matches this format. - raise TypeError( - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type %s" % - type(col_idx), - ) from None - # Fall back to existing code - usecols = np.array([operator.index(i) for i in usecols_as_list], - dtype=np.int32) + usecols = [usecols] _ensure_ndmin_ndarray_check_param(ndmin) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 548f5e5b4..6539a3d36 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -871,16 +871,27 @@ class TestLoadTxt(LoadTxtBase): bogus_idx = 1.5 assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=bogus_idx ) assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=[0, bogus_idx, 0] ) + def test_bad_usecols(self): + with pytest.raises(OverflowError): + np.loadtxt(["1\n"], usecols=[2**64], delimiter=",") + with pytest.raises((ValueError, OverflowError)): + # Overflow error on 32bit platforms + np.loadtxt(["1\n"], usecols=[2**62], delimiter=",") + with pytest.raises(TypeError, + match="If a structured dtype .*. But 1 usecols were given and " + "the number of fields is 3."): + np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",") + def test_fancy_dtype(self): c = TextIO() c.write('1,2,3.0\n4,5,6.0\n') |