MAINT: Move usecol handling to C and support more than integer cols

Of course to actually use that many columns you need A LOT of memory right now. Each field stores at least a UCS4 NUL character, but the field is padded enough to require 16 bytes. We always parse a full row, so that requires 20 bytes per field... (i.e. 32 GiB RAM is not enough to test this :)).
author: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-12 22:36:29 -0600
committer: Sebastian Berg <sebastian@sipsolutions.net> 2022-01-14 20:07:07 -0600
commit: e2d9f6b8f34b45657773b42f1c1334e075b443b3 (patch)
tree: f9228d03d84828da3710100a2d5e448033a03eb7
parent: 334470edb1e4e4ea1bc87773ef6d0c6fd510486a (diff)
download: numpy-e2d9f6b8f34b45657773b42f1c1334e075b443b3.tar.gz
5 files changed, 74 insertions, 59 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index fcf5056e2..869be4b02 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -8,6 +8,7 @@
 #define _MULTIARRAYMODULE
 #include "numpy/arrayobject.h"
 #include "npy_argparse.h"
+#include "common.h"
 #include "conversion_utils.h"
 
 #include "textreading/parser_config.h"
@@ -34,14 +35,13 @@
 // number of columns in the file must match the number of fields in `dtype`.
 //
 static PyObject *
-_readtext_from_stream(stream *s, parser_config *pc,
-                      PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows,
-                      PyObject *converters, PyObject *dtype)
+_readtext_from_stream(stream *s,
+        parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
+        Py_ssize_t skiprows, Py_ssize_t max_rows,
+        PyObject *converters, PyObject *dtype)
 {
     PyArrayObject *arr = NULL;
     PyArray_Descr *out_dtype = NULL;
-    int32_t *cols;
-    int ncols;
     field_type *ft = NULL;
 
     /*
@@ -52,24 +52,24 @@ _readtext_from_stream(stream *s, parser_config *pc,
     out_dtype = (PyArray_Descr *)dtype;
     Py_INCREF(out_dtype);
 
-    npy_intp num_fields = field_types_create(out_dtype, &ft);
+    Py_ssize_t num_fields = field_types_create(out_dtype, &ft);
     if (num_fields < 0) {
         goto finish;
     }
     bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype;
 
-    if (usecols == Py_None) {
-        ncols = num_fields;
-        cols = NULL;
-    }
-    else {
-        ncols = PyArray_SIZE((PyArrayObject *)usecols);
-        cols = PyArray_DATA((PyArrayObject *)usecols);
+    if (!homogeneous && usecols != NULL && num_usecols != num_fields) {
+        PyErr_Format(PyExc_TypeError,
+                "If a structured dtype is used, the number of columns in "
+                "`usecols` must match the effective number of fields. "
+                "But %zd usecols were given and the number of fields is %zd.",
+                num_usecols, num_fields);
+        goto finish;
     }
 
     arr = read_rows(
             s, max_rows, num_fields, ft, pc,
-            ncols, cols, skiprows, converters,
+            num_usecols, usecols, skiprows, converters,
             NULL, out_dtype, homogeneous);
     if (arr == NULL) {
         goto finish;
@@ -107,7 +107,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
     PyObject *file;
     Py_ssize_t skiprows = 0;
     Py_ssize_t max_rows = -1;
-    PyObject *usecols = Py_None;
+    PyObject *usecols_obj = Py_None;
     PyObject *converters = Py_None;
 
     PyObject *dtype = Py_None;
@@ -136,7 +136,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
             "|comment", &parse_control_character, &pc.comment,
             "|quote", &parse_control_character, &pc.quote,
             "|imaginary_unit", &parse_control_character, &pc.imaginary_unit,
-            "|usecols", NULL, &usecols,
+            "|usecols", NULL, &usecols_obj,
             "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows,
             "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows,
             "|converters", NULL, &converters,
@@ -172,24 +172,38 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
             return NULL;
         }
     }
+
     /*
-     * TODO: It would be nicer to move usecol parsing to C, but we don't have
-     *       quite the right helper in NumPy yet so using a 1D, 32bit,
-     *       contiguous array.  (and ensure this is true)
-     * NOTE: This should never fail as the public API ensures the conditions
-     *       are met.
+     * Parse usecols, the rest of NumPy has no clear helper for this, so do
+     * it here manually.
      */
-    if (usecols != Py_None) {
-        if (!PyArray_CheckExact(usecols)
-                || PyArray_NDIM((PyArrayObject *)usecols) != 1
-                || !PyArray_ISCARRAY((PyArrayObject *)usecols)
-                || PyArray_DESCR((PyArrayObject *)usecols)->kind != 'i'
-                || PyArray_DESCR((PyArrayObject *)usecols)->elsize != 4
-                || PyArray_ISBYTESWAPPED((PyArrayObject *)usecols)) {
-            PyErr_SetString(PyExc_RuntimeError,
-                    "Internally a bad value was passed for usecols.");
+    Py_ssize_t num_usecols = -1;
+    Py_ssize_t *usecols = NULL;
+    if (usecols_obj != Py_None) {
+        num_usecols = PySequence_Length(usecols_obj);
+        if (num_usecols < 0) {
             return NULL;
         }
+        /* Calloc just to not worry about overflow */
+        usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
+        for (Py_ssize_t i = 0; i < num_usecols; i++) {
+            PyObject *tmp = PySequence_GetItem(usecols_obj, i);
+            if (tmp == NULL) {
+                return NULL;
+            }
+            usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
+            if (error_converting(usecols[i])) {
+                if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                    PyErr_Format(PyExc_TypeError,
+                            "usecols must be an int or a sequence of ints but "
+                            "it contains at least one element of type '%s'",
+                            tmp->ob_type->tp_name);
+                }
+                Py_DECREF(tmp);
+                return NULL;
+            }
+            Py_DECREF(tmp);
+        }
     }
 
     stream *s;
@@ -201,12 +215,14 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
     }
     if (s == NULL) {
         PyErr_Format(PyExc_RuntimeError, "Unable to access the file.");
+        PyMem_FREE(usecols);
         return NULL;
     }
 
-    arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows,
-                                converters, dtype);
+    arr = _readtext_from_stream(
+            s, &pc, num_usecols, usecols, skiprows, max_rows, converters, dtype);
     stream_close(s);
+    PyMem_FREE(usecols);
     return arr;
 }
 
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 08bc80cc4..8c95ba537 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -31,7 +31,7 @@
  */
 static PyObject **
 create_conv_funcs(
-        PyObject *converters, int num_fields, int32_t *usecols)
+        PyObject *converters, int num_fields, Py_ssize_t *usecols)
 {
     PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
     if (conv_funcs == NULL) {
@@ -155,8 +155,8 @@ create_conv_funcs(
  */
 NPY_NO_EXPORT PyArrayObject *
 read_rows(stream *s,
-        npy_intp max_rows, int num_field_types, field_type *field_types,
-        parser_config *pconfig, int num_usecols, int *usecols,
+        npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types,
+        parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
         Py_ssize_t skiplines, PyObject *converters,
         PyArrayObject *data_array, PyArray_Descr *out_descr,
         bool homogeneous)
@@ -187,6 +187,7 @@ read_rows(stream *s,
     int actual_num_fields = -1;
     if (usecols != NULL) {
         actual_num_fields = num_usecols;
+        assert(num_field_types == num_usecols);
     }
     else if (!homogeneous) {
         actual_num_fields = num_field_types;
@@ -330,9 +331,9 @@ read_rows(stream *s,
             }
         }
 
-        for (int i = 0; i < actual_num_fields; ++i) {
-            int f;  /* The field, either 0 (if homogeneous) or i. */
-            int col;  /* The column as read, remapped by usecols */
+        for (Py_ssize_t i = 0; i < actual_num_fields; ++i) {
+            Py_ssize_t f;  /* The field, either 0 (if homogeneous) or i. */
+            Py_ssize_t col;  /* The column as read, remapped by usecols */
             char *item_ptr;
             if (homogeneous) {
                 f = 0;
diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h
index 342af0a4b..20eb9e186 100644
--- a/numpy/core/src/multiarray/textreading/rows.h
+++ b/numpy/core/src/multiarray/textreading/rows.h
@@ -13,8 +13,8 @@
 
 NPY_NO_EXPORT PyArrayObject *
 read_rows(stream *s,
-        npy_intp nrows, int num_field_types, field_type *field_types,
-        parser_config *pconfig, int num_usecols, int *usecols,
+        npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types,
+        parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
         Py_ssize_t skiplines, PyObject *converters,
         PyArrayObject *data_array, PyArray_Descr *out_descr,
         bool homogeneous);
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6e660dce1..f15f94580 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -909,25 +909,12 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
         dtype = np.dtype(object)
 
     if usecols is not None:
-        # Allow usecols to be a single int or a sequence of ints
+        # Allow usecols to be a single int or a sequence of ints, the C-code
+        # handles the rest
         try:
-            usecols_as_list = list(usecols)
+            usecols = list(usecols)
         except TypeError:
-            usecols_as_list = [usecols]
-        for col_idx in usecols_as_list:
-            try:
-                operator.index(col_idx)
-            except TypeError:
-                # Some unit tests for numpy.loadtxt require that the
-                # error message matches this format.
-                raise TypeError(
-                    "usecols must be an int or a sequence of ints but "
-                    "it contains at least one element of type %s" %
-                    type(col_idx),
-                    ) from None
-        # Fall back to existing code
-        usecols = np.array([operator.index(i) for i in usecols_as_list],
-                           dtype=np.int32)
+            usecols = [usecols]
 
     _ensure_ndmin_ndarray_check_param(ndmin)
 
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 548f5e5b4..6539a3d36 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -871,16 +871,27 @@ class TestLoadTxt(LoadTxtBase):
         bogus_idx = 1.5
         assert_raises_regex(
             TypeError,
-            '^usecols must be.*%s' % type(bogus_idx),
+            '^usecols must be.*%s' % type(bogus_idx).__name__,
             np.loadtxt, c, usecols=bogus_idx
             )
 
         assert_raises_regex(
             TypeError,
-            '^usecols must be.*%s' % type(bogus_idx),
+            '^usecols must be.*%s' % type(bogus_idx).__name__,
             np.loadtxt, c, usecols=[0, bogus_idx, 0]
             )
 
+    def test_bad_usecols(self):
+        with pytest.raises(OverflowError):
+            np.loadtxt(["1\n"], usecols=[2**64], delimiter=",")
+        with pytest.raises((ValueError, OverflowError)):
+            # Overflow error on 32bit platforms
+            np.loadtxt(["1\n"], usecols=[2**62], delimiter=",")
+        with pytest.raises(TypeError,
+                match="If a structured dtype .*. But 1 usecols were given and "
+                      "the number of fields is 3."):
+            np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",")
+
     def test_fancy_dtype(self):
         c = TextIO()
         c.write('1,2,3.0\n4,5,6.0\n')
author	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-12 22:36:29 -0600
committer	Sebastian Berg <sebastian@sipsolutions.net>	2022-01-14 20:07:07 -0600
commit	e2d9f6b8f34b45657773b42f1c1334e075b443b3 (patch)
tree	f9228d03d84828da3710100a2d5e448033a03eb7
parent	334470edb1e4e4ea1bc87773ef6d0c6fd510486a (diff)
download	numpy-e2d9f6b8f34b45657773b42f1c1334e075b443b3.tar.gz