summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-01-12 22:36:29 -0600
committerSebastian Berg <sebastian@sipsolutions.net>2022-01-14 20:07:07 -0600
commite2d9f6b8f34b45657773b42f1c1334e075b443b3 (patch)
treef9228d03d84828da3710100a2d5e448033a03eb7
parent334470edb1e4e4ea1bc87773ef6d0c6fd510486a (diff)
downloadnumpy-e2d9f6b8f34b45657773b42f1c1334e075b443b3.tar.gz
MAINT: Move usecol handling to C and support more than integer cols
Of course to actually use that many columns you need A LOT of memory right now. Each field stores at least a UCS4 NUL character, but the field is padded enough to require 16 bytes. We always parse a full row, so that requires 20 bytes per field... (i.e. 32 GiB RAM is not enough to test this :)).
-rw-r--r--numpy/core/src/multiarray/textreading/readtext.c80
-rw-r--r--numpy/core/src/multiarray/textreading/rows.c13
-rw-r--r--numpy/core/src/multiarray/textreading/rows.h4
-rw-r--r--numpy/lib/npyio.py21
-rw-r--r--numpy/lib/tests/test_io.py15
5 files changed, 74 insertions, 59 deletions
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index fcf5056e2..869be4b02 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -8,6 +8,7 @@
#define _MULTIARRAYMODULE
#include "numpy/arrayobject.h"
#include "npy_argparse.h"
+#include "common.h"
#include "conversion_utils.h"
#include "textreading/parser_config.h"
@@ -34,14 +35,13 @@
// number of columns in the file must match the number of fields in `dtype`.
//
static PyObject *
-_readtext_from_stream(stream *s, parser_config *pc,
- PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows,
- PyObject *converters, PyObject *dtype)
+_readtext_from_stream(stream *s,
+ parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
+ Py_ssize_t skiprows, Py_ssize_t max_rows,
+ PyObject *converters, PyObject *dtype)
{
PyArrayObject *arr = NULL;
PyArray_Descr *out_dtype = NULL;
- int32_t *cols;
- int ncols;
field_type *ft = NULL;
/*
@@ -52,24 +52,24 @@ _readtext_from_stream(stream *s, parser_config *pc,
out_dtype = (PyArray_Descr *)dtype;
Py_INCREF(out_dtype);
- npy_intp num_fields = field_types_create(out_dtype, &ft);
+ Py_ssize_t num_fields = field_types_create(out_dtype, &ft);
if (num_fields < 0) {
goto finish;
}
bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype;
- if (usecols == Py_None) {
- ncols = num_fields;
- cols = NULL;
- }
- else {
- ncols = PyArray_SIZE((PyArrayObject *)usecols);
- cols = PyArray_DATA((PyArrayObject *)usecols);
+ if (!homogeneous && usecols != NULL && num_usecols != num_fields) {
+ PyErr_Format(PyExc_TypeError,
+ "If a structured dtype is used, the number of columns in "
+ "`usecols` must match the effective number of fields. "
+ "But %zd usecols were given and the number of fields is %zd.",
+ num_usecols, num_fields);
+ goto finish;
}
arr = read_rows(
s, max_rows, num_fields, ft, pc,
- ncols, cols, skiprows, converters,
+ num_usecols, usecols, skiprows, converters,
NULL, out_dtype, homogeneous);
if (arr == NULL) {
goto finish;
@@ -107,7 +107,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
PyObject *file;
Py_ssize_t skiprows = 0;
Py_ssize_t max_rows = -1;
- PyObject *usecols = Py_None;
+ PyObject *usecols_obj = Py_None;
PyObject *converters = Py_None;
PyObject *dtype = Py_None;
@@ -136,7 +136,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
"|comment", &parse_control_character, &pc.comment,
"|quote", &parse_control_character, &pc.quote,
"|imaginary_unit", &parse_control_character, &pc.imaginary_unit,
- "|usecols", NULL, &usecols,
+ "|usecols", NULL, &usecols_obj,
"|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows,
"|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows,
"|converters", NULL, &converters,
@@ -172,24 +172,38 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
return NULL;
}
}
+
/*
- * TODO: It would be nicer to move usecol parsing to C, but we don't have
- * quite the right helper in NumPy yet so using a 1D, 32bit,
- * contiguous array. (and ensure this is true)
- * NOTE: This should never fail as the public API ensures the conditions
- * are met.
+ * Parse usecols, the rest of NumPy has no clear helper for this, so do
+ * it here manually.
*/
- if (usecols != Py_None) {
- if (!PyArray_CheckExact(usecols)
- || PyArray_NDIM((PyArrayObject *)usecols) != 1
- || !PyArray_ISCARRAY((PyArrayObject *)usecols)
- || PyArray_DESCR((PyArrayObject *)usecols)->kind != 'i'
- || PyArray_DESCR((PyArrayObject *)usecols)->elsize != 4
- || PyArray_ISBYTESWAPPED((PyArrayObject *)usecols)) {
- PyErr_SetString(PyExc_RuntimeError,
- "Internally a bad value was passed for usecols.");
+ Py_ssize_t num_usecols = -1;
+ Py_ssize_t *usecols = NULL;
+ if (usecols_obj != Py_None) {
+ num_usecols = PySequence_Length(usecols_obj);
+ if (num_usecols < 0) {
return NULL;
}
+ /* Calloc just to not worry about overflow */
+ usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
+ for (Py_ssize_t i = 0; i < num_usecols; i++) {
+ PyObject *tmp = PySequence_GetItem(usecols_obj, i);
+ if (tmp == NULL) {
+ return NULL;
+ }
+ usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
+ if (error_converting(usecols[i])) {
+ if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+ PyErr_Format(PyExc_TypeError,
+ "usecols must be an int or a sequence of ints but "
+ "it contains at least one element of type '%s'",
+ tmp->ob_type->tp_name);
+ }
+ Py_DECREF(tmp);
+ return NULL;
+ }
+ Py_DECREF(tmp);
+ }
}
stream *s;
@@ -201,12 +215,14 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
}
if (s == NULL) {
PyErr_Format(PyExc_RuntimeError, "Unable to access the file.");
+ PyMem_FREE(usecols);
return NULL;
}
- arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows,
- converters, dtype);
+ arr = _readtext_from_stream(
+ s, &pc, num_usecols, usecols, skiprows, max_rows, converters, dtype);
stream_close(s);
+ PyMem_FREE(usecols);
return arr;
}
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 08bc80cc4..8c95ba537 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -31,7 +31,7 @@
*/
static PyObject **
create_conv_funcs(
- PyObject *converters, int num_fields, int32_t *usecols)
+ PyObject *converters, int num_fields, Py_ssize_t *usecols)
{
PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
if (conv_funcs == NULL) {
@@ -155,8 +155,8 @@ create_conv_funcs(
*/
NPY_NO_EXPORT PyArrayObject *
read_rows(stream *s,
- npy_intp max_rows, int num_field_types, field_type *field_types,
- parser_config *pconfig, int num_usecols, int *usecols,
+ npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types,
+ parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
Py_ssize_t skiplines, PyObject *converters,
PyArrayObject *data_array, PyArray_Descr *out_descr,
bool homogeneous)
@@ -187,6 +187,7 @@ read_rows(stream *s,
int actual_num_fields = -1;
if (usecols != NULL) {
actual_num_fields = num_usecols;
+ assert(num_field_types == num_usecols);
}
else if (!homogeneous) {
actual_num_fields = num_field_types;
@@ -330,9 +331,9 @@ read_rows(stream *s,
}
}
- for (int i = 0; i < actual_num_fields; ++i) {
- int f; /* The field, either 0 (if homogeneous) or i. */
- int col; /* The column as read, remapped by usecols */
+ for (Py_ssize_t i = 0; i < actual_num_fields; ++i) {
+ Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. */
+ Py_ssize_t col; /* The column as read, remapped by usecols */
char *item_ptr;
if (homogeneous) {
f = 0;
diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h
index 342af0a4b..20eb9e186 100644
--- a/numpy/core/src/multiarray/textreading/rows.h
+++ b/numpy/core/src/multiarray/textreading/rows.h
@@ -13,8 +13,8 @@
NPY_NO_EXPORT PyArrayObject *
read_rows(stream *s,
- npy_intp nrows, int num_field_types, field_type *field_types,
- parser_config *pconfig, int num_usecols, int *usecols,
+ npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types,
+ parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
Py_ssize_t skiplines, PyObject *converters,
PyArrayObject *data_array, PyArray_Descr *out_descr,
bool homogeneous);
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6e660dce1..f15f94580 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -909,25 +909,12 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
dtype = np.dtype(object)
if usecols is not None:
- # Allow usecols to be a single int or a sequence of ints
+ # Allow usecols to be a single int or a sequence of ints, the C-code
+ # handles the rest
try:
- usecols_as_list = list(usecols)
+ usecols = list(usecols)
except TypeError:
- usecols_as_list = [usecols]
- for col_idx in usecols_as_list:
- try:
- operator.index(col_idx)
- except TypeError:
- # Some unit tests for numpy.loadtxt require that the
- # error message matches this format.
- raise TypeError(
- "usecols must be an int or a sequence of ints but "
- "it contains at least one element of type %s" %
- type(col_idx),
- ) from None
- # Fall back to existing code
- usecols = np.array([operator.index(i) for i in usecols_as_list],
- dtype=np.int32)
+ usecols = [usecols]
_ensure_ndmin_ndarray_check_param(ndmin)
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 548f5e5b4..6539a3d36 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -871,16 +871,27 @@ class TestLoadTxt(LoadTxtBase):
bogus_idx = 1.5
assert_raises_regex(
TypeError,
- '^usecols must be.*%s' % type(bogus_idx),
+ '^usecols must be.*%s' % type(bogus_idx).__name__,
np.loadtxt, c, usecols=bogus_idx
)
assert_raises_regex(
TypeError,
- '^usecols must be.*%s' % type(bogus_idx),
+ '^usecols must be.*%s' % type(bogus_idx).__name__,
np.loadtxt, c, usecols=[0, bogus_idx, 0]
)
+ def test_bad_usecols(self):
+ with pytest.raises(OverflowError):
+ np.loadtxt(["1\n"], usecols=[2**64], delimiter=",")
+ with pytest.raises((ValueError, OverflowError)):
+ # Overflow error on 32bit platforms
+ np.loadtxt(["1\n"], usecols=[2**62], delimiter=",")
+ with pytest.raises(TypeError,
+ match="If a structured dtype .*. But 1 usecols were given and "
+ "the number of fields is 3."):
+ np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",")
+
def test_fancy_dtype(self):
c = TextIO()
c.write('1,2,3.0\n4,5,6.0\n')