Diffstat (limited to 'numpy')
-rw-r--r--  numpy/__init__.py | 4
-rw-r--r--  numpy/core/setup.py | 10
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h | 72
-rw-r--r--  numpy/core/src/multiarray/conversion_utils.c | 11
-rw-r--r--  numpy/core/src/multiarray/conversion_utils.h | 3
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c | 3
-rw-r--r--  numpy/core/src/multiarray/textreading/conversions.c | 395
-rw-r--r--  numpy/core/src/multiarray/textreading/conversions.h | 57
-rw-r--r--  numpy/core/src/multiarray/textreading/field_types.c | 201
-rw-r--r--  numpy/core/src/multiarray/textreading/field_types.h | 67
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.c | 47
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.h | 7
-rw-r--r--  numpy/core/src/multiarray/textreading/parser_config.h | 61
-rw-r--r--  numpy/core/src/multiarray/textreading/readtext.c | 312
-rw-r--r--  numpy/core/src/multiarray/textreading/readtext.h | 7
-rw-r--r--  numpy/core/src/multiarray/textreading/rows.c | 481
-rw-r--r--  numpy/core/src/multiarray/textreading/rows.h | 22
-rw-r--r--  numpy/core/src/multiarray/textreading/str_to_int.c | 67
-rw-r--r--  numpy/core/src/multiarray/textreading/str_to_int.h | 174
-rw-r--r--  numpy/core/src/multiarray/textreading/stream.h | 41
-rw-r--r--  numpy/core/src/multiarray/textreading/stream_pyobject.c | 239
-rw-r--r--  numpy/core/src/multiarray/textreading/stream_pyobject.h | 16
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.c.src | 457
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.h | 78
-rw-r--r--  numpy/core/src/npysort/quicksort.c.src | 17
-rw-r--r--  numpy/core/src/npysort/x86-qsort.dispatch.c.src | 587
-rw-r--r--  numpy/core/src/npysort/x86-qsort.h | 18
-rw-r--r--  numpy/core/tests/test_multiarray.py | 49
-rw-r--r--  numpy/lib/function_base.py | 2
-rw-r--r--  numpy/lib/npyio.py | 752
-rw-r--r--  numpy/lib/tests/test_io.py | 77
-rw-r--r--  numpy/lib/tests/test_loadtxt.py | 1002
-rw-r--r--  numpy/testing/_private/utils.py | 2
-rw-r--r--  numpy/testing/tests/test_utils.py | 8
34 files changed, 4999 insertions(+), 347 deletions(-)
diff --git a/numpy/__init__.py b/numpy/__init__.py
index abe53fe9a..46d80fb76 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -52,8 +52,6 @@ of numpy are available under the ``doc`` sub-module::
Available subpackages
---------------------
-doc
- Topical documentation on broadcasting, indexing, etc.
lib
Basic functions used by several sub-packages.
random
@@ -66,8 +64,6 @@ polynomial
Polynomial tools
testing
NumPy testing tools
-f2py
- Fortran to Python Interface Generator.
distutils
Enhancements to distutils with support for
Fortran compilers support and more.
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a13480907..9704cff0a 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -869,6 +869,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'typeinfo.h'),
join('src', 'multiarray', 'usertypes.h'),
join('src', 'multiarray', 'vdot.h'),
+ join('src', 'multiarray', 'textreading', 'readtext.h'),
join('include', 'numpy', 'arrayobject.h'),
join('include', 'numpy', '_neighborhood_iterator_imp.h'),
join('include', 'numpy', 'npy_endian.h'),
@@ -947,6 +948,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'vdot.c'),
join('src', 'common', 'npy_sort.h.src'),
+ join('src', 'npysort', 'x86-qsort.dispatch.c.src'),
join('src', 'npysort', 'quicksort.c.src'),
join('src', 'npysort', 'mergesort.cpp'),
join('src', 'npysort', 'timsort.cpp'),
@@ -956,6 +958,14 @@ def configuration(parent_package='',top_path=None):
join('src', 'npysort', 'selection.cpp'),
join('src', 'common', 'npy_binsearch.h'),
join('src', 'npysort', 'binsearch.cpp'),
+ join('src', 'multiarray', 'textreading', 'conversions.c'),
+ join('src', 'multiarray', 'textreading', 'field_types.c'),
+ join('src', 'multiarray', 'textreading', 'growth.c'),
+ join('src', 'multiarray', 'textreading', 'readtext.c'),
+ join('src', 'multiarray', 'textreading', 'rows.c'),
+ join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
+ join('src', 'multiarray', 'textreading', 'str_to_int.c'),
+ join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
]
#######################################################################
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index f8632e701..93e9d9d45 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -371,7 +371,79 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
#define npyv_sum_u64 _mm512_reduce_add_epi64
#define npyv_sum_f32 _mm512_reduce_add_ps
#define npyv_sum_f64 _mm512_reduce_add_pd
+ #define npyv_reducemin_u32 _mm512_reduce_min_epu32
+ #define npyv_reducemin_s32 _mm512_reduce_min_epi32
+ #define npyv_reducemin_f32 _mm512_reduce_min_ps
+ #define npyv_reducemax_u32 _mm512_reduce_max_epu32
+ #define npyv_reducemax_s32 _mm512_reduce_max_epi32
+ #define npyv_reducemax_f32 _mm512_reduce_max_ps
#else
+ NPY_FINLINE npy_uint32 npyv_reducemax_u32(npyv_u32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_u32 a1 = _mm512_max_epu32(a, _mm512_permutex2var_epi32(a, idx1, a));
+ npyv_u32 a2 = _mm512_max_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
+ npyv_u32 a3 = _mm512_max_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+ npyv_u32 a4 = _mm512_max_epu32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
+ }
+
+ NPY_FINLINE npy_int32 npyv_reducemax_s32(npyv_s32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_s32 a1 = _mm512_max_epi32(a, _mm512_permutex2var_epi32(a, idx1, a));
+ npyv_s32 a2 = _mm512_max_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
+ npyv_s32 a3 = _mm512_max_epi32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+ npyv_s32 a4 = _mm512_max_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
+ }
+
+ NPY_FINLINE npy_float npyv_reducemax_f32(npyv_f32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_f32 a1 = _mm512_max_ps(a, _mm512_permutex2var_ps(a, idx1, a));
+ npyv_f32 a2 = _mm512_max_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1));
+ npyv_f32 a3 = _mm512_max_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+        npyv_f32 a4 = _mm512_max_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00));
+ }
+
+ NPY_FINLINE npy_uint32 npyv_reducemin_u32(npyv_u32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_u32 a1 = _mm512_min_epu32(a, _mm512_permutex2var_epi32(a, idx1, a));
+ npyv_u32 a2 = _mm512_min_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
+ npyv_u32 a3 = _mm512_min_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+ npyv_u32 a4 = _mm512_min_epu32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
+ }
+
+ NPY_FINLINE npy_int32 npyv_reducemin_s32(npyv_s32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_s32 a1 = _mm512_min_epi32(a, _mm512_permutex2var_epi32(a, idx1, a));
+ npyv_s32 a2 = _mm512_min_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
+ npyv_s32 a3 = _mm512_min_epi32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+ npyv_s32 a4 = _mm512_min_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
+ }
+
+ NPY_FINLINE npy_float npyv_reducemin_f32(npyv_f32 a)
+ {
+ const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+ npyv_f32 a1 = _mm512_min_ps(a, _mm512_permutex2var_ps(a, idx1, a));
+ npyv_f32 a2 = _mm512_min_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1));
+ npyv_f32 a3 = _mm512_min_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2)));
+        npyv_f32 a4 = _mm512_min_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1)));
+ return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00));
+ }
+
NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
{
__m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
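The fallback branch above reduces a 16-lane AVX-512 register to a scalar by repeatedly folding the vector in half (16 -> 8 -> 4 -> 2 -> 1 lanes) and taking the element-wise max/min at each step; the permute/shuffle constants pair each lane with the lane in the opposite half of the current group. A scalar model of the same folding pattern (illustration only, not part of this patch):

    #include <stdio.h>

    /* Fold a 16-element array in half repeatedly, keeping the element-wise
     * maximum, mirroring the permute/shuffle reduction above. */
    static float reduce_max_16(float v[16])
    {
        for (int half = 8; half >= 1; half /= 2) {  /* 16 -> 8 -> 4 -> 2 -> 1 */
            for (int i = 0; i < half; i++) {
                if (v[i + half] > v[i]) {
                    v[i] = v[i + half];
                }
            }
        }
        return v[0];
    }

    int main(void)
    {
        float v[16] = {3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3};
        printf("max = %g\n", reduce_max_16(v));  /* prints "max = 9" */
        return 0;
    }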
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index a1de580d9..e4eb4f49e 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o)
}
+NPY_NO_EXPORT int
+PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val)
+{
+ *val = PyArray_PyIntAsIntp(o);
+ if (error_converting(*val)) {
+ return NPY_FAIL;
+ }
+ return NPY_SUCCEED;
+}
+
+
/*
* PyArray_IntpFromIndexSequence
* Returns the number of dimensions or -1 if an error occurred.
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 4072841ee..4d0fbb894 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -7,6 +7,9 @@ NPY_NO_EXPORT int
PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq);
NPY_NO_EXPORT int
+PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val);
+
+NPY_NO_EXPORT int
PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq);
typedef enum {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 789446d0c..a7b6898e1 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "get_attr_string.h"
#include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */
+#include "textreading/readtext.h" /* _readtext_from_file_object */
#include "npy_dlpack.h"
@@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api,
METH_O, NULL},
+ {"_load_from_filelike", (PyCFunction)_load_from_filelike,
+ METH_FASTCALL | METH_KEYWORDS, NULL},
/* from umath */
{"frompyfunc",
(PyCFunction) ufunc_frompyfunc,
diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
new file mode 100644
index 000000000..11f4210f7
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -0,0 +1,395 @@
+
+#include <Python.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "lowlevel_strided_loops.h"
+
+#include "conversions.h"
+#include "str_to_int.h"
+
+#include "array_coercion.h"
+
+
+/*
+ * Coercion to boolean is done via integer right now.
+ */
+NPY_NO_EXPORT int
+to_bool(PyArray_Descr *NPY_UNUSED(descr),
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ int64_t res;
+ if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) {
+ return -1;
+ }
+ *dataptr = (char)(res != 0);
+ return 0;
+}
+
+
+/*
+ * In order to not bundle a whole copy of a floating point parser, we copy
+ * the input into an ASCII buffer and call the Python one. Float parsing isn't
+ * super quick, so this is not terrible, but avoiding it would speed things up.
+ *
+ * Also note that parsing the first float of a complex will copy the whole
+ * string to ascii rather than just the first part.
+ * TODO: A tweak of the break might be a simple mitigation there.
+ *
+ * @param str The UCS4 string to parse
+ * @param end Pointer to the end of the string
+ * @param strip_whitespace If false, leading and trailing whitespace is not
+ *        skipped (used by the complex parser).
+ * @param result Output stored as double value.
+ * @param p_end Set to the first character after the parsed value (and after
+ *        any stripped trailing whitespace).
+ */
+static NPY_INLINE int
+double_from_ucs4(
+ const Py_UCS4 *str, const Py_UCS4 *end,
+ bool strip_whitespace, double *result, const Py_UCS4 **p_end)
+{
+ /* skip leading whitespace */
+ if (strip_whitespace) {
+ while (Py_UNICODE_ISSPACE(*str)) {
+ str++;
+ }
+ }
+ if (str == end) {
+ return -1; /* empty or only whitespace: not a floating point number */
+ }
+
+ /* We convert to ASCII for the Python parser, use stack if small: */
+ char stack_buf[128];
+ char *heap_buf = NULL;
+ char *ascii = stack_buf;
+
+ size_t str_len = end - str + 1;
+ if (str_len > 128) {
+ heap_buf = PyMem_MALLOC(str_len);
+ if (heap_buf == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ascii = heap_buf;
+ }
+ char *c = ascii;
+ for (; str < end; str++, c++) {
+ if (NPY_UNLIKELY(*str >= 128)) {
+ /* Character cannot be used, ignore for end calculation and stop */
+ end = str;
+ break;
+ }
+ *c = (char)(*str);
+ }
+ *c = '\0';
+
+ char *end_parsed;
+ *result = PyOS_string_to_double(ascii, &end_parsed, NULL);
+ /* Rewind `end` to the first UCS4 character not parsed: */
+ end = end - (c - end_parsed);
+
+ PyMem_FREE(heap_buf);
+
+ if (*result == -1. && PyErr_Occurred()) {
+ return -1;
+ }
+
+ if (strip_whitespace) {
+        /* and then skip any remaining whitespace: */
+ while (Py_UNICODE_ISSPACE(*end)) {
+ end++;
+ }
+ }
+ *p_end = end;
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+to_float(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ double double_val;
+ const Py_UCS4 *p_end;
+ if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) {
+ return -1;
+ }
+ if (p_end != end) {
+ return -1;
+ }
+
+ float val = (float)double_val;
+ memcpy(dataptr, &val, sizeof(float));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ npy_bswap4_unaligned(dataptr);
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+to_double(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(pconfig))
+{
+ double val;
+ const Py_UCS4 *p_end;
+ if (double_from_ucs4(str, end, true, &val, &p_end) < 0) {
+ return -1;
+ }
+ if (p_end != end) {
+ return -1;
+ }
+
+ memcpy(dataptr, &val, sizeof(double));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ npy_bswap8_unaligned(dataptr);
+ }
+ return 0;
+}
+
+
+static bool
+to_complex_int(
+ const Py_UCS4 *item, const Py_UCS4 *token_end,
+ double *p_real, double *p_imag,
+ Py_UCS4 imaginary_unit, bool allow_parens)
+{
+ const Py_UCS4 *p_end;
+ bool unmatched_opening_paren = false;
+
+ /* Remove whitespace before the possibly leading '(' */
+ while (Py_UNICODE_ISSPACE(*item)) {
+ ++item;
+ }
+ if (allow_parens && (*item == '(')) {
+ unmatched_opening_paren = true;
+ ++item;
+ /* Allow whitespace within the parentheses: "( 1j)" */
+ while (Py_UNICODE_ISSPACE(*item)) {
+ ++item;
+ }
+ }
+ if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) {
+ return false;
+ }
+ if (p_end == token_end) {
+ // No imaginary part in the string (e.g. "3.5")
+ *p_imag = 0.0;
+ return !unmatched_opening_paren;
+ }
+ if (*p_end == imaginary_unit) {
+ /* Only an imaginary part (e.g "1.5j") */
+ *p_imag = *p_real;
+ *p_real = 0.0;
+ ++p_end;
+ }
+ else if (*p_end == '+' || *p_end == '-') {
+ /* Imaginary part still to parse */
+ if (*p_end == '+') {
+ ++p_end; /* Advance to support +- (and ++) */
+ }
+ if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) {
+ return false;
+ }
+ if (*p_end != imaginary_unit) {
+ return false;
+ }
+ ++p_end;
+ }
+ else {
+ *p_imag = 0;
+ }
+
+ if (unmatched_opening_paren) {
+ /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */
+ while (Py_UNICODE_ISSPACE(*p_end)) {
+ ++p_end;
+ }
+ if (*p_end == ')') {
+ ++p_end;
+ }
+ else {
+            /* parenthesis was not closed */
+ return false;
+ }
+ }
+
+ while (Py_UNICODE_ISSPACE(*p_end)) {
+ ++p_end;
+ }
+ return p_end == token_end;
+}
+
+
+NPY_NO_EXPORT int
+to_cfloat(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig)
+{
+ double real;
+ double imag;
+
+ bool success = to_complex_int(
+ str, end, &real, &imag,
+ pconfig->imaginary_unit, true);
+
+ if (!success) {
+ return -1;
+ }
+ npy_complex64 val = {(float)real, (float)imag};
+ memcpy(dataptr, &val, sizeof(npy_complex64));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ npy_bswap4_unaligned(dataptr);
+ npy_bswap4_unaligned(dataptr + 4);
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+to_cdouble(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig)
+{
+ double real;
+ double imag;
+
+ bool success = to_complex_int(
+ str, end, &real, &imag, pconfig->imaginary_unit, true);
+
+ if (!success) {
+ return -1;
+ }
+ npy_complex128 val = {real, imag};
+ memcpy(dataptr, &val, sizeof(npy_complex128));
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ npy_bswap8_unaligned(dataptr);
+ npy_bswap8_unaligned(dataptr + 8);
+ }
+ return 0;
+}
+
+
+/*
+ * String and unicode conversion functions.
+ */
+NPY_NO_EXPORT int
+to_string(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(unused))
+{
+ const Py_UCS4* c = str;
+ size_t length = descr->elsize;
+
+ for (size_t i = 0; i < length; i++) {
+ if (c < end) {
+ /*
+ * loadtxt assumed latin1, which is compatible with UCS1 (first
+ * 256 unicode characters).
+ */
+ if (NPY_UNLIKELY(*c > 255)) {
+                /* TODO: This was a UnicodeDecodeError; is an unspecific error good enough? */
+ return -1;
+ }
+ dataptr[i] = (Py_UCS1)(*c);
+ c++;
+ }
+ else {
+ dataptr[i] = '\0';
+ }
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+to_unicode(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *NPY_UNUSED(unused))
+{
+ int length = descr->elsize / 4;
+
+ if (length <= end - str) {
+ memcpy(dataptr, str, length * 4);
+ }
+ else {
+ size_t given_len = end - str;
+ memcpy(dataptr, str, given_len * 4);
+ memset(dataptr + given_len * 4, '\0', (length - given_len) * 4);
+ }
+
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ for (int i = 0; i < length; i++) {
+ npy_bswap4_unaligned(dataptr);
+ dataptr += 4;
+ }
+ }
+ return 0;
+}
+
+
+
+/*
+ * Convert functions helper for the generic converter.
+ */
+static PyObject *
+call_converter_function(
+ PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters)
+{
+ PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length);
+ if (s == NULL) {
+ return s;
+ }
+ if (byte_converters) {
+ Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL));
+ if (s == NULL) {
+ return NULL;
+ }
+ }
+ if (func == NULL) {
+ return s;
+ }
+ PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL);
+ Py_DECREF(s);
+ return result;
+}
+
+
+NPY_NO_EXPORT int
+to_generic_with_converter(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *config, PyObject *func)
+{
+ bool use_byte_converter;
+ if (func == NULL) {
+ use_byte_converter = config->c_byte_converters;
+ }
+ else {
+ use_byte_converter = config->python_byte_converters;
+ }
+ /* Converts to unicode and calls custom converter (if set) */
+ PyObject *converted = call_converter_function(
+ func, str, (size_t)(end - str), use_byte_converter);
+ if (converted == NULL) {
+ return -1;
+ }
+
+ int res = PyArray_Pack(descr, dataptr, converted);
+ Py_DECREF(converted);
+ return res;
+}
+
+
+NPY_NO_EXPORT int
+to_generic(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *config)
+{
+ return to_generic_with_converter(descr, str, end, dataptr, config, NULL);
+}
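double_from_ucs4() above avoids bundling a float parser by narrowing the UCS4 token to ASCII and handing it to PyOS_string_to_double. A rough standalone model of that narrowing step (illustration only; uint32_t and strtod stand in for Py_UCS4 and the Python parser):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Narrow a code-point token to ASCII and parse it as a double.
     * Returns 0 on success, -1 on failure.  (Sketch only: the real code uses
     * Py_UCS4, PyOS_string_to_double, a heap buffer for long tokens, and
     * truncates at the first non-ASCII character instead of failing.) */
    static int double_from_codepoints(const uint32_t *str, const uint32_t *end,
                                      double *result)
    {
        char ascii[128];
        char *c = ascii;
        for (; str < end && c < ascii + sizeof(ascii) - 1; str++, c++) {
            if (*str >= 128) {
                return -1;  /* non-ASCII cannot be part of a number here */
            }
            *c = (char)*str;
        }
        *c = '\0';

        char *parse_end;
        *result = strtod(ascii, &parse_end);
        return (c != ascii && parse_end == c) ? 0 : -1;
    }

    int main(void)
    {
        const uint32_t token[] = {'1', '.', '5', 'e', '3'};
        double val;
        if (double_from_codepoints(token, token + 5, &val) == 0) {
            printf("parsed %g\n", val);  /* prints "parsed 1500" */
        }
        return 0;
    }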
diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h
new file mode 100644
index 000000000..222eea4e7
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/conversions.h
@@ -0,0 +1,57 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_
+
+#include <stdbool.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#include "textreading/parser_config.h"
+
+NPY_NO_EXPORT int
+to_bool(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+NPY_NO_EXPORT int
+to_float(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+NPY_NO_EXPORT int
+to_double(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+NPY_NO_EXPORT int
+to_cfloat(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+NPY_NO_EXPORT int
+to_cdouble(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+NPY_NO_EXPORT int
+to_string(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused);
+
+NPY_NO_EXPORT int
+to_unicode(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused);
+
+NPY_NO_EXPORT int
+to_generic_with_converter(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *unused, PyObject *func);
+
+NPY_NO_EXPORT int
+to_generic(PyArray_Descr *descr,
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+ parser_config *pconfig);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ */
diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c
new file mode 100644
index 000000000..0722efd57
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/field_types.c
@@ -0,0 +1,201 @@
+#include "field_types.h"
+#include "conversions.h"
+#include "str_to_int.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+#include "alloc.h"
+
+#include "textreading/growth.h"
+
+
+NPY_NO_EXPORT void
+field_types_xclear(int num_field_types, field_type *ft) {
+ assert(num_field_types >= 0);
+ if (ft == NULL) {
+ return;
+ }
+ for (int i = 0; i < num_field_types; i++) {
+ Py_XDECREF(ft[i].descr);
+ ft[i].descr = NULL;
+ }
+ PyMem_Free(ft);
+}
+
+
+/*
+ * Fetch custom converters for the builtin NumPy DTypes (or the generic one).
+ * Structured DTypes get unpacked and `object` uses the generic method.
+ *
+ * TODO: This should probably be moved onto the DType object in some form,
+ * to allow user DTypes to define their own converters.
+ */
+static set_from_ucs4_function *
+get_from_ucs4_function(PyArray_Descr *descr)
+{
+ if (descr->type_num == NPY_BOOL) {
+ return &to_bool;
+ }
+ else if (PyDataType_ISSIGNED(descr)) {
+ switch (descr->elsize) {
+ case 1:
+ return &to_int8;
+ case 2:
+ return &to_int16;
+ case 4:
+ return &to_int32;
+ case 8:
+ return &to_int64;
+ default:
+ assert(0);
+ }
+ }
+ else if (PyDataType_ISUNSIGNED(descr)) {
+ switch (descr->elsize) {
+ case 1:
+ return &to_uint8;
+ case 2:
+ return &to_uint16;
+ case 4:
+ return &to_uint32;
+ case 8:
+ return &to_uint64;
+ default:
+ assert(0);
+ }
+ }
+ else if (descr->type_num == NPY_FLOAT) {
+ return &to_float;
+ }
+ else if (descr->type_num == NPY_DOUBLE) {
+ return &to_double;
+ }
+ else if (descr->type_num == NPY_CFLOAT) {
+ return &to_cfloat;
+ }
+ else if (descr->type_num == NPY_CDOUBLE) {
+ return &to_cdouble;
+ }
+ else if (descr->type_num == NPY_STRING) {
+ return &to_string;
+ }
+ else if (descr->type_num == NPY_UNICODE) {
+ return &to_unicode;
+ }
+ return &to_generic;
+}
+
+
+/*
+ * Note that the function cleans up `ft` on error. If `num_field_types < 0`
+ * cleanup has already happened in the internal call.
+ */
+static npy_intp
+field_type_grow_recursive(PyArray_Descr *descr,
+ npy_intp num_field_types, field_type **ft, npy_intp *ft_size,
+ npy_intp field_offset)
+{
+ if (PyDataType_HASSUBARRAY(descr)) {
+ PyArray_Dims shape = {NULL, -1};
+
+ if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+ PyErr_SetString(PyExc_ValueError, "invalid subarray shape");
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len);
+ npy_free_cache_dim_obj(shape);
+ for (npy_intp i = 0; i < size; i++) {
+ num_field_types = field_type_grow_recursive(descr->subarray->base,
+ num_field_types, ft, ft_size, field_offset);
+ field_offset += descr->subarray->base->elsize;
+ if (num_field_types < 0) {
+ return -1;
+ }
+ }
+ return num_field_types;
+ }
+ else if (PyDataType_HASFIELDS(descr)) {
+ npy_int num_descr_fields = PyTuple_Size(descr->names);
+ if (num_descr_fields < 0) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ for (npy_intp i = 0; i < num_descr_fields; i++) {
+ PyObject *key = PyTuple_GET_ITEM(descr->names, i);
+ PyObject *tup = PyObject_GetItem(descr->fields, key);
+ if (tup == NULL) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ PyArray_Descr *field_descr;
+ PyObject *title;
+ int offset;
+ if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) {
+ Py_DECREF(tup);
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ Py_DECREF(tup);
+ num_field_types = field_type_grow_recursive(
+ field_descr, num_field_types, ft, ft_size,
+ field_offset + offset);
+ if (num_field_types < 0) {
+ return -1;
+ }
+ }
+ return num_field_types;
+ }
+
+ if (*ft_size <= num_field_types) {
+ npy_intp alloc_size = grow_size_and_multiply(
+ ft_size, 4, sizeof(field_type));
+ if (alloc_size < 0) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ field_type *new_ft = PyMem_Realloc(*ft, alloc_size);
+ if (new_ft == NULL) {
+ field_types_xclear(num_field_types, *ft);
+ return -1;
+ }
+ *ft = new_ft;
+ }
+
+ Py_INCREF(descr);
+ (*ft)[num_field_types].descr = descr;
+ (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr);
+ (*ft)[num_field_types].structured_offset = field_offset;
+
+ return num_field_types + 1;
+}
+
+
+/*
+ * Prepare the "field_types" for the given dtypes/descriptors. Currently,
+ * we copy the itemsize, but the main thing is that we check for custom
+ * converters.
+ */
+NPY_NO_EXPORT npy_intp
+field_types_create(PyArray_Descr *descr, field_type **ft)
+{
+ if (descr->subarray != NULL) {
+ /*
+ * This could probably be allowed, but NumPy absorbs the dimensions
+ * so it is an awkward corner case that probably never really worked.
+ */
+ PyErr_SetString(PyExc_TypeError,
+ "file reader does not support subarray dtypes. You can"
+ "put the dtype into a structured one using "
+ "`np.dtype(('name', dtype))` to avoid this limitation.");
+ return -1;
+ }
+
+ npy_intp ft_size = 4;
+ *ft = PyMem_Malloc(ft_size * sizeof(field_type));
+ if (*ft == NULL) {
+ return -1;
+ }
+ return field_type_grow_recursive(descr, 0, ft, &ft_size, 0);
+}
diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h
new file mode 100644
index 000000000..f26e00a5e
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/field_types.h
@@ -0,0 +1,67 @@
+
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/parser_config.h"
+
+/**
+ * Function defining the conversion for each value.
+ *
+ * This function must support unaligned memory access. As of now, there is
+ * no special error handling (in whatever form): We assume that it is always
+ * reasonable to raise a `ValueError` noting the string that failed to be
+ * converted.
+ *
+ * NOTE: An earlier version of the code had unused default values (pandas
+ * does this) when columns are missing. We could define this either
+ * by passing `NULL` in, or by adding a default explicitly somewhere.
+ * (I think users should probably have to define the default, at which
+ * point it doesn't matter here.)
+ *
+ * NOTE: We are currently passing the parser config, this could be made public
+ * or could be set up to be dtype specific/private. Always passing
+ * pconfig fully seems easier right now even if it may change.
+ * (A future use-case may for example be user-specified strings that are
+ * considered boolean True or False).
+ *
+ * TODO: Aside from nailing down the above notes, it may be nice to expose
+ *       these functions publicly. This could allow user DTypes to provide
+ * a converter or custom converters written in C rather than Python.
+ *
+ * @param descr The NumPy descriptor of the field (may be byte-swapped, etc.)
+ * @param str Pointer to the beginning of the UCS4 string to be parsed.
+ * @param end Pointer to the end of the UCS4 string. This value is currently
+ * guaranteed to be `\0`, ensuring that parsers can rely on
+ * nul-termination.
+ * @param dataptr The pointer where to store the parsed value
+ * @param pconfig Additional configuration for the parser.
+ * @returns 0 on success and -1 on failure. If the return value is -1 an
+ * error may or may not be set. If an error is set, it is chained
+ * behind the generic ValueError.
+ */
+typedef int (set_from_ucs4_function)(
+ PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end,
+ char *dataptr, parser_config *pconfig);
+
+typedef struct _field_type {
+ set_from_ucs4_function *set_from_ucs4;
+ /* The original NumPy descriptor */
+ PyArray_Descr *descr;
+ /* Offset to this entry within row. */
+ npy_intp structured_offset;
+} field_type;
+
+
+NPY_NO_EXPORT void
+field_types_xclear(int num_field_types, field_type *ft);
+
+NPY_NO_EXPORT npy_intp
+field_types_create(PyArray_Descr *descr, field_type **ft);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ */
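The set_from_ucs4_function signature documented above is what the converters in conversions.c and str_to_int.c implement. As a sketch of the extension point the TODO mentions, a hypothetical custom converter for a 1-byte column that accepts "y"/"n" could look like the following (it only compiles inside the NumPy source tree and is not part of this patch):

    #include <Python.h>

    #define NPY_NO_DEPRECATED_API NPY_API_VERSION
    #define _MULTIARRAYMODULE
    #include "numpy/ndarraytypes.h"

    #include "textreading/parser_config.h"

    /* Hypothetical converter matching set_from_ucs4_function: stores 1 for
     * "y"/"Y" and 0 for "n"/"N", and fails (-1) for anything else. */
    static int
    to_yes_no(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end,
              char *dataptr, parser_config *pconfig)
    {
        (void)descr;
        (void)pconfig;             /* unused in this sketch */
        if (end - str != 1) {
            return -1;             /* expect exactly one character */
        }
        if (*str == 'y' || *str == 'Y') {
            *dataptr = 1;
            return 0;
        }
        if (*str == 'n' || *str == 'N') {
            *dataptr = 0;
            return 0;
        }
        return -1;
    }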
diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c
new file mode 100644
index 000000000..49a09d572
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/growth.c
@@ -0,0 +1,47 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "templ_common.h"
+
+/*
+ * Helper function taking the current size and growing it (based on min_grow).
+ * The scheme is a minimum growth plus a general growth by 25%
+ * overallocation, capped at 2**20 elements, as that puts us
+ * in the range of large page sizes (so it is presumably more than enough).
+ *
+ * It further multiplies the result with `itemsize` and ensures that everything
+ * fits into an `npy_intp`.
+ * Returns -1 if any overflow occurred or the result would not fit.
+ * The caller has to ensure the input fits into an ssize_t and is not negative.
+ */
+NPY_NO_EXPORT npy_intp
+grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize) {
+ /* min_grow must be a power of two: */
+ assert((min_grow & (min_grow - 1)) == 0);
+ npy_uintp new_size = (npy_uintp)*size;
+ npy_intp growth = *size >> 2;
+ if (growth <= min_grow) {
+ /* can never lead to overflow if we are using min_growth */
+ new_size += min_grow;
+ }
+ else {
+ if (growth > 1 << 20) {
+ /* limit growth to order of MiB (even hugepages are not larger) */
+ growth = 1 << 20;
+ }
+ new_size += growth + min_grow - 1;
+ new_size &= ~min_grow;
+
+ if (new_size > NPY_MAX_INTP) {
+ return -1;
+ }
+ }
+ *size = (npy_intp)new_size;
+ npy_intp alloc_size;
+ if (npy_mul_with_overflow_intp(&alloc_size, (npy_intp)new_size, itemsize)) {
+ return -1;
+ }
+ return alloc_size;
+}
+
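grow_size_and_multiply() implements the overallocation policy described above; read_rows() later calls it every time the row buffer fills up. A standalone model of the size sequence it produces (illustration only: the rounding to a multiple of min_grow and the npy_mul_with_overflow_intp check are omitted):

    #include <stdio.h>

    /* Scalar model of the growth policy: grow by max(min_grow, ~25%) rows,
     * with the 25% part capped at 2**20 elements. */
    static long grow(long size, long min_grow)
    {
        long growth = size >> 2;             /* 25% overallocation */
        if (growth <= min_grow) {
            return size + min_grow;
        }
        if (growth > (1L << 20)) {
            growth = 1L << 20;               /* cap at ~1 Mi elements */
        }
        return size + growth;
    }

    int main(void)
    {
        long rows = 0;
        for (int i = 0; i < 8; i++) {
            rows = grow(rows, 512);          /* example rows_per_block */
            printf("allocated rows: %ld\n", rows);
        }
        /* prints 512, 1024, 1536, 2048, 2560, 3200, 4000, 5000 */
        return 0;
    }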
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
new file mode 100644
index 000000000..237b77ad3
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -0,0 +1,7 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
+
+NPY_NO_EXPORT npy_intp
+grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
new file mode 100644
index 000000000..00e911667
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -0,0 +1,61 @@
+
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_
+
+#include <stdbool.h>
+
+typedef struct {
+ /*
+ * Field delimiter character.
+ * Typically ',', ' ', '\t', ignored if `delimiter_is_whitespace` is true.
+ */
+ Py_UCS4 delimiter;
+
+ /*
+ * Character used to quote fields.
+ * Typically '"' or "'". To disable quoting we set this to UINT_MAX
+ * (which is not a valid unicode character and thus cannot occur in the
+ * file; the same is used for all other characters if necessary).
+ */
+ Py_UCS4 quote;
+
+ /*
+     * Character(s) that indicate the start of a comment.
+     * Typically '#', '%' or ';'.
+     * When encountered in a line and not inside quotes, all characters
+     * from the comment character(s) to the end of the line are ignored.
+ */
+ Py_UCS4 comment;
+
+ /*
+ * Ignore whitespace at the beginning of a field (outside/before quotes).
+ * Is (and must be) set if `delimiter_is_whitespace`.
+ */
+ bool ignore_leading_whitespace;
+
+ /*
+ * If true, the delimiter is ignored and any unicode whitespace is used
+ * for splitting (same as `string.split()` in Python). In that case
+ * `ignore_leading_whitespace` should also be set.
+ */
+ bool delimiter_is_whitespace;
+
+ /*
+ * The imaginary unit character. Default is `j`.
+ */
+ Py_UCS4 imaginary_unit;
+
+ /*
+     * Data should be encoded as `latin1` when using the Python converter
+     * (implementing the `loadtxt` default Python 2 compatibility mode).
+     * The C byte converter is used when the user requested `dtype="S"`.
+     * In this case we go via `dtype=object`; however, loadtxt allows latin1
+     * while normal object-to-string casts only accept ASCII, so this ensures
+     * that the object array already contains bytes and not strings.
+ */
+ bool python_byte_converters;
+ bool c_byte_converters;
+} parser_config;
+
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
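A parser_config is typically built with C99 designated initializers; readtext.c below does exactly that for the defaults. A minimal usage sketch (assumes Python.h for Py_UCS4; not part of this patch):

    #include <Python.h>
    #include "textreading/parser_config.h"

    /* Typical configuration: comma-delimited, '#' comments, '"' quoting. */
    static const parser_config example_config = {
        .delimiter = ',',
        .comment = '#',
        .quote = '"',
        .imaginary_unit = 'j',
        .delimiter_is_whitespace = false,
        .ignore_leading_whitespace = false,
        .python_byte_converters = false,
        .c_byte_converters = false,
    };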
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
new file mode 100644
index 000000000..7af5ee891
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -0,0 +1,312 @@
+#include <stdio.h>
+#include <stdbool.h>
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "npy_argparse.h"
+#include "common.h"
+#include "conversion_utils.h"
+
+#include "textreading/parser_config.h"
+#include "textreading/stream_pyobject.h"
+#include "textreading/field_types.h"
+#include "textreading/rows.h"
+#include "textreading/str_to_int.h"
+
+
+//
+// `usecols` must point to a Python object that is Py_None or a 1-d contiguous
+// numpy array with data type int32.
+//
+// `dtype` must point to a Python object that is Py_None or a numpy dtype
+// instance. If the latter, code and sizes must be arrays of length
+// num_dtype_fields, holding the flattened data field type codes and byte
+// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype,
+// but we do that in Python code.)
+//
+// If both `usecols` and `dtype` are not None, and the data type is compound,
+// then len(usecols) must equal num_dtype_fields.
+//
+// If `dtype` is given and it is compound, and `usecols` is None, then the
+// number of columns in the file must match the number of fields in `dtype`.
+//
+static PyObject *
+_readtext_from_stream(stream *s,
+ parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
+ Py_ssize_t skiplines, Py_ssize_t max_rows,
+ PyObject *converters, PyObject *dtype)
+{
+ PyArrayObject *arr = NULL;
+ PyArray_Descr *out_dtype = NULL;
+ field_type *ft = NULL;
+
+ /*
+     * If the dtype is not structured, the result is considered "homogeneous"
+     * and we have to discover the number of columns from the data.
+ */
+ out_dtype = (PyArray_Descr *)dtype;
+ Py_INCREF(out_dtype);
+
+ Py_ssize_t num_fields = field_types_create(out_dtype, &ft);
+ if (num_fields < 0) {
+ goto finish;
+ }
+ bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype;
+
+ if (!homogeneous && usecols != NULL && num_usecols != num_fields) {
+ PyErr_Format(PyExc_TypeError,
+ "If a structured dtype is used, the number of columns in "
+ "`usecols` must match the effective number of fields. "
+ "But %zd usecols were given and the number of fields is %zd.",
+ num_usecols, num_fields);
+ goto finish;
+ }
+
+ arr = read_rows(
+ s, max_rows, num_fields, ft, pc,
+ num_usecols, usecols, skiplines, converters,
+ NULL, out_dtype, homogeneous);
+ if (arr == NULL) {
+ goto finish;
+ }
+
+ finish:
+ Py_XDECREF(out_dtype);
+ field_types_xclear(num_fields, ft);
+ return (PyObject *)arr;
+}
+
+
+static int
+parse_control_character(PyObject *obj, Py_UCS4 *character)
+{
+ if (obj == Py_None) {
+ *character = (Py_UCS4)-1; /* character beyond unicode range */
+ return 1;
+ }
+ if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) != 1) {
+ PyErr_Format(PyExc_TypeError,
+ "Text reading control character must be a single unicode "
+ "character or None; but got: %.100R", obj);
+ return 0;
+ }
+ *character = PyUnicode_READ_CHAR(obj, 0);
+ return 1;
+}
+
+
+/*
+ * A (somewhat verbose) check that none of the control characters match or are
+ * newline. Most of these combinations are completely fine, just weird or
+ * surprising.
+ * (I.e. there is an implicit priority for control characters, so if a comment
+ * matches a delimiter, it would just be a comment.)
+ * In theory some `delimiter=None` paths could have a "meaning", but let us
+ * assume that users are better off setting one of the control chars to `None`
+ * for clarity.
+ *
+ * This also checks that the control characters cannot be newlines.
+ */
+static int
+error_if_matching_control_characters(
+ Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+ char *control_char1;
+ char *control_char2 = NULL;
+ if (comment != (Py_UCS4)-1) {
+ control_char1 = "comment";
+ if (comment == '\r' || comment == '\n') {
+ goto error;
+ }
+ else if (comment == quote) {
+ control_char2 = "quotechar";
+ goto error;
+ }
+ else if (comment == delimiter) {
+ control_char2 = "delimiter";
+ goto error;
+ }
+ }
+ if (quote != (Py_UCS4)-1) {
+ control_char1 = "quotechar";
+ if (quote == '\r' || quote == '\n') {
+ goto error;
+ }
+ else if (quote == delimiter) {
+ control_char2 = "delimiter";
+ goto error;
+ }
+ }
+ if (delimiter != (Py_UCS4)-1) {
+ control_char1 = "delimiter";
+ if (delimiter == '\r' || delimiter == '\n') {
+ goto error;
+ }
+ }
+ /* The above doesn't work with delimiter=None, which means "whitespace" */
+ if (delimiter == (Py_UCS4)-1) {
+ control_char1 = "delimiter";
+ if (Py_UNICODE_ISSPACE(comment)) {
+ control_char2 = "comment";
+ goto error;
+ }
+ else if (Py_UNICODE_ISSPACE(quote)) {
+ control_char2 = "quotechar";
+ goto error;
+ }
+ }
+ return 0;
+
+ error:
+ if (control_char2 != NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "The values for control characters '%s' and '%s' are "
+ "incompatible",
+ control_char1, control_char2);
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+ control_char1, control_char2);
+ }
+ return -1;
+}
+
+
+NPY_NO_EXPORT PyObject *
+_load_from_filelike(PyObject *NPY_UNUSED(mod),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ PyObject *file;
+ Py_ssize_t skiplines = 0;
+ Py_ssize_t max_rows = -1;
+ PyObject *usecols_obj = Py_None;
+ PyObject *converters = Py_None;
+
+ PyObject *dtype = Py_None;
+ PyObject *encoding_obj = Py_None;
+ const char *encoding = NULL;
+
+ parser_config pc = {
+ .delimiter = ',',
+ .comment = '#',
+ .quote = '"',
+ .imaginary_unit = 'j',
+ .delimiter_is_whitespace = false,
+ .ignore_leading_whitespace = false,
+ .python_byte_converters = false,
+ .c_byte_converters = false,
+ };
+ bool filelike = true;
+
+ PyObject *arr = NULL;
+
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames,
+ "file", NULL, &file,
+ "|delimiter", &parse_control_character, &pc.delimiter,
+ "|comment", &parse_control_character, &pc.comment,
+ "|quote", &parse_control_character, &pc.quote,
+ "|imaginary_unit", &parse_control_character, &pc.imaginary_unit,
+ "|usecols", NULL, &usecols_obj,
+ "|skiplines", &PyArray_IntpFromPyIntConverter, &skiplines,
+ "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows,
+ "|converters", NULL, &converters,
+ "|dtype", NULL, &dtype,
+ "|encoding", NULL, &encoding_obj,
+ "|filelike", &PyArray_BoolConverter, &filelike,
+ "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters,
+ "|c_byte_converters", PyArray_BoolConverter, &pc.c_byte_converters,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
+ }
+
+ /* Reject matching control characters, they just rarely make sense anyway */
+ if (error_if_matching_control_characters(
+ pc.delimiter, pc.quote, pc.comment) < 0) {
+ return NULL;
+ }
+
+ if (pc.delimiter == (Py_UCS4)-1) {
+ pc.delimiter_is_whitespace = true;
+ /* Ignore leading whitespace to match `string.split(None)` */
+ pc.ignore_leading_whitespace = true;
+ }
+
+ if (!PyArray_DescrCheck(dtype) ) {
+ PyErr_SetString(PyExc_TypeError,
+ "internal error: dtype must be provided and be a NumPy dtype");
+ return NULL;
+ }
+
+ if (encoding_obj != Py_None) {
+ if (!PyUnicode_Check(encoding_obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "encoding must be a unicode string.");
+ return NULL;
+ }
+ encoding = PyUnicode_AsUTF8(encoding_obj);
+ if (encoding == NULL) {
+ return NULL;
+ }
+ }
+
+ /*
+ * Parse usecols, the rest of NumPy has no clear helper for this, so do
+ * it here manually.
+ */
+ Py_ssize_t num_usecols = -1;
+ Py_ssize_t *usecols = NULL;
+ if (usecols_obj != Py_None) {
+ num_usecols = PySequence_Length(usecols_obj);
+ if (num_usecols < 0) {
+ return NULL;
+ }
+ /* Calloc just to not worry about overflow */
+        usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
+        if (usecols == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+ for (Py_ssize_t i = 0; i < num_usecols; i++) {
+ PyObject *tmp = PySequence_GetItem(usecols_obj, i);
+ if (tmp == NULL) {
+ PyMem_FREE(usecols);
+ return NULL;
+ }
+ usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
+ if (error_converting(usecols[i])) {
+ if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+ PyErr_Format(PyExc_TypeError,
+ "usecols must be an int or a sequence of ints but "
+ "it contains at least one element of type '%s'",
+ Py_TYPE(tmp)->tp_name);
+ }
+ Py_DECREF(tmp);
+ PyMem_FREE(usecols);
+ return NULL;
+ }
+ Py_DECREF(tmp);
+ }
+ }
+
+ stream *s;
+ if (filelike) {
+ s = stream_python_file(file, encoding);
+ }
+ else {
+ s = stream_python_iterable(file, encoding);
+ }
+ if (s == NULL) {
+ PyMem_FREE(usecols);
+ return NULL;
+ }
+
+ arr = _readtext_from_stream(
+ s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
+ stream_close(s);
+ PyMem_FREE(usecols);
+ return arr;
+}
+
diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h
new file mode 100644
index 000000000..5cf48c555
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/readtext.h
@@ -0,0 +1,7 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_
+
+NPY_NO_EXPORT PyObject *
+_load_from_filelike(PyObject *self, PyObject *args, PyObject *kwargs);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ */
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
new file mode 100644
index 000000000..e30ff835e
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -0,0 +1,481 @@
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "numpy/npy_3kcompat.h"
+#include "alloc.h"
+
+#include <string.h>
+#include <stdbool.h>
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/conversions.h"
+#include "textreading/field_types.h"
+#include "textreading/rows.h"
+#include "textreading/growth.h"
+
+/*
+ * Minimum size to grow the allocation by (or 25%). The 8KiB means the actual
+ * growth is within `8 KiB <= size < 16 KiB` (depending on the row size).
+ */
+#define MIN_BLOCK_SIZE (1 << 13)
+
+
+
+/*
+ * Create the array of converter functions from the Python converters.
+ */
+static PyObject **
+create_conv_funcs(
+ PyObject *converters, Py_ssize_t num_fields, const Py_ssize_t *usecols)
+{
+ assert(converters != Py_None);
+
+ PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
+ if (conv_funcs == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ if (PyCallable_Check(converters)) {
+ /* a single converter used for all columns individually */
+ for (Py_ssize_t i = 0; i < num_fields; i++) {
+ Py_INCREF(converters);
+ conv_funcs[i] = converters;
+ }
+ return conv_funcs;
+ }
+ else if (!PyDict_Check(converters)) {
+ PyErr_SetString(PyExc_TypeError,
+ "converters must be a dictionary mapping columns to converter "
+ "functions or a single callable.");
+ goto error;
+ }
+
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+ while (PyDict_Next(converters, &pos, &key, &value)) {
+ Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError);
+ if (column == -1 && PyErr_Occurred()) {
+ PyErr_Format(PyExc_TypeError,
+ "keys of the converters dictionary must be integers; "
+ "got %.100R", key);
+ goto error;
+ }
+ if (usecols != NULL) {
+ /*
+ * This code searches for the corresponding usecol. It is
+ * identical to the legacy usecols code, which has two weaknesses:
+ * 1. It fails for duplicated usecols only setting converter for
+ * the first one.
+ * 2. It fails e.g. if usecols uses negative indexing and
+ * converters does not. (This is a feature, since it allows
+ * us to correctly normalize converters to result column here.)
+ */
+ Py_ssize_t i = 0;
+ for (; i < num_fields; i++) {
+ if (column == usecols[i]) {
+ column = i;
+ break;
+ }
+ }
+ if (i == num_fields) {
+ continue; /* ignore unused converter */
+ }
+ }
+ else {
+ if (column < -num_fields || column >= num_fields) {
+ PyErr_Format(PyExc_ValueError,
+ "converter specified for column %zd, which is invalid "
+ "for the number of fields %d.", column, num_fields);
+ goto error;
+ }
+ if (column < 0) {
+ column += num_fields;
+ }
+ }
+ if (!PyCallable_Check(value)) {
+ PyErr_Format(PyExc_TypeError,
+ "values of the converters dictionary must be callable, "
+ "but the value associated with key %R is not", key);
+ goto error;
+ }
+ Py_INCREF(value);
+ conv_funcs[column] = value;
+ }
+ return conv_funcs;
+
+ error:
+ for (Py_ssize_t i = 0; i < num_fields; i++) {
+ Py_XDECREF(conv_funcs[i]);
+ }
+ PyMem_FREE(conv_funcs);
+ return NULL;
+}
+
+/**
+ * Read a file into the provided array, or create (and possibly grow) an
+ * array to read into.
+ *
+ * @param s The stream object/struct providing reading capabilities used by
+ * the tokenizer.
+ * @param max_rows The number of rows to read, or -1. If negative
+ * all rows are read.
+ * @param num_field_types The number of field types stored in `field_types`.
+ * @param field_types Information about the dtype for each column (or one if
+ * `homogeneous`).
+ * @param pconfig Pointer to the parser config object used by both the
+ * tokenizer and the conversion functions.
+ * @param num_usecols The number of columns in `usecols`.
+ * @param usecols An array of length `num_usecols` or NULL. If given indicates
+ * which column is read for each individual row (negative columns are
+ * accepted).
+ * @param skiplines The number of lines to skip, these lines are ignored.
+ * @param converters Python dictionary of converters. Finalizing converters
+ * is difficult without information about the number of columns.
+ * @param data_array An array to be filled or NULL. In either case a new
+ * reference is returned (the reference to `data_array` is not stolen).
+ * @param out_descr The dtype used for allocating a new array. This is not
+ * used if `data_array` is provided. Note that the actual dtype of the
+ * returned array can differ for strings.
+ * @param homogeneous Whether the datatype of the array is homogeneous, i.e.
+ *        not structured. In this case the number of columns has to be
+ *        discovered from the data and the returned array will be
+ *        2-dimensional rather than 1-dimensional.
+ *
+ * @returns Returns the result as an array object or NULL on error. The result
+ * is always a new reference (even when `data_array` was passed in).
+ */
+NPY_NO_EXPORT PyArrayObject *
+read_rows(stream *s,
+ npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types,
+ parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
+ Py_ssize_t skiplines, PyObject *converters,
+ PyArrayObject *data_array, PyArray_Descr *out_descr,
+ bool homogeneous)
+{
+ char *data_ptr = NULL;
+ Py_ssize_t current_num_fields;
+ npy_intp row_size = out_descr->elsize;
+ PyObject **conv_funcs = NULL;
+
+ bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT);
+
+ int ndim = homogeneous ? 2 : 1;
+ npy_intp result_shape[2] = {0, 1};
+
+ bool data_array_allocated = data_array == NULL;
+ /* Make sure we own `data_array` for the purpose of error handling */
+ Py_XINCREF(data_array);
+ size_t rows_per_block = 1; /* will be increased depending on row size */
+ npy_intp data_allocated_rows = 0;
+
+ /* We give a warning if max_rows is used and an empty line is encountered */
+ bool give_empty_row_warning = max_rows >= 0;
+
+ int ts_result = 0;
+ tokenizer_state ts;
+ if (tokenizer_init(&ts, pconfig) < 0) {
+ goto error;
+ }
+
+ /* Set the actual number of fields if it is already known, otherwise -1 */
+ Py_ssize_t actual_num_fields = -1;
+ if (usecols != NULL) {
+ assert(homogeneous || num_field_types == num_usecols);
+ actual_num_fields = num_usecols;
+ }
+ else if (!homogeneous) {
+ assert(usecols == NULL || num_field_types == num_usecols);
+ actual_num_fields = num_field_types;
+ }
+
+ for (Py_ssize_t i = 0; i < skiplines; i++) {
+ ts.state = TOKENIZE_GOTO_LINE_END;
+ ts_result = tokenize(s, &ts, pconfig);
+ if (ts_result < 0) {
+ goto error;
+ }
+ else if (ts_result != 0) {
+ /* Fewer lines than skiplines is acceptable */
+ break;
+ }
+ }
+
+ Py_ssize_t row_count = 0; /* number of rows actually processed */
+ while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) {
+ ts_result = tokenize(s, &ts, pconfig);
+ if (ts_result < 0) {
+ goto error;
+ }
+ current_num_fields = ts.num_fields;
+ field_info *fields = ts.fields;
+ if (NPY_UNLIKELY(ts.num_fields == 0)) {
+ /*
+ * Deprecated NumPy 1.23, 2021-01-13 (not really a deprecation,
+ * but similar policy should apply to removing the warning again)
+ */
+ /* Tokenizer may give a final "empty line" even if there is none */
+ if (give_empty_row_warning && ts_result == 0) {
+ give_empty_row_warning = false;
+ if (PyErr_WarnFormat(PyExc_UserWarning, 3,
+ "Input line %zd contained no data and will not be "
+ "counted towards `max_rows=%zd`. This differs from "
+ "the behaviour in NumPy <=1.22 which counted lines "
+ "rather than rows. If desired, the previous behaviour "
+ "can be achieved by using `itertools.islice`.\n"
+ "Please see the 1.23 release notes for an example on "
+ "how to do this. If you wish to ignore this warning, "
+ "use `warnings.filterwarnings`. This warning is "
+ "expected to be removed in the future and is given "
+ "only once per `loadtxt` call.",
+ row_count + skiplines + 1, max_rows) < 0) {
+ goto error;
+ }
+ }
+ continue; /* Ignore empty line */
+ }
+
+ if (NPY_UNLIKELY(data_ptr == NULL)) {
+ // We've deferred some of the initialization tasks to here,
+ // because we've now read the first line, and we definitively
+ // know how many fields (i.e. columns) we will be processing.
+ if (actual_num_fields == -1) {
+ actual_num_fields = current_num_fields;
+ }
+
+ if (converters != Py_None) {
+ conv_funcs = create_conv_funcs(
+ converters, actual_num_fields, usecols);
+ if (conv_funcs == NULL) {
+ goto error;
+ }
+ }
+
+ /* Note that result_shape[1] is only used if homogeneous is true */
+ result_shape[1] = actual_num_fields;
+ if (homogeneous) {
+ row_size *= actual_num_fields;
+ }
+
+ if (data_array == NULL) {
+ if (max_rows < 0) {
+ /*
+ * Negative max_rows denotes to read the whole file, we
+ * approach this by allocating ever larger blocks.
+ * Adds a number of rows based on `MIN_BLOCK_SIZE`.
+ * Note: later code grows assuming this is a power of two.
+ */
+ if (row_size == 0) {
+ /* actual rows_per_block should not matter here */
+ rows_per_block = 512;
+ }
+ else {
+ /* safe on overflow since min_rows will be 0 or 1 */
+ size_t min_rows = (
+ (MIN_BLOCK_SIZE + row_size - 1) / row_size);
+ while (rows_per_block < min_rows) {
+ rows_per_block *= 2;
+ }
+ }
+ data_allocated_rows = rows_per_block;
+ }
+ else {
+ data_allocated_rows = max_rows;
+ }
+ result_shape[0] = data_allocated_rows;
+ Py_INCREF(out_descr);
+ /*
+ * We do not use Empty, as it would fill with None
+                 * and require decref'ing if we shrink again.
+ */
+ data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr(
+ ndim, result_shape, out_descr);
+#ifdef NPY_RELAXED_STRIDES_DEBUG
+ /* Incompatible with NPY_RELAXED_STRIDES_DEBUG due to growing */
+ if (result_shape[0] == 1) {
+ PyArray_STRIDES(data_array)[0] = row_size;
+ }
+#endif /* NPY_RELAXED_STRIDES_DEBUG */
+ if (data_array == NULL) {
+ goto error;
+ }
+ if (needs_init) {
+ memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array));
+ }
+ }
+ else {
+ assert(max_rows >=0);
+ data_allocated_rows = max_rows;
+ }
+ data_ptr = PyArray_BYTES(data_array);
+ }
+
+ if (!usecols && (actual_num_fields != current_num_fields)) {
+ PyErr_Format(PyExc_ValueError,
+ "the number of columns changed from %d to %d at row %zu; "
+ "use `usecols` to select a subset and avoid this error",
+ actual_num_fields, current_num_fields, row_count+1);
+ goto error;
+ }
+
+ if (NPY_UNLIKELY(data_allocated_rows == row_count)) {
+ /*
+ * Grow by ~25% and rounded up to the next rows_per_block
+ * NOTE: This is based on very crude timings and could be refined!
+ */
+ npy_intp new_rows = data_allocated_rows;
+ npy_intp alloc_size = grow_size_and_multiply(
+ &new_rows, rows_per_block, row_size);
+ if (alloc_size < 0) {
+ /* should normally error much earlier, but make sure */
+ PyErr_SetString(PyExc_ValueError,
+ "array is too big. Cannot read file as a single array; "
+ "providing a maximum number of rows to read may help.");
+ goto error;
+ }
+
+ char *new_data = PyDataMem_UserRENEW(
+ PyArray_BYTES(data_array), alloc_size ? alloc_size : 1,
+ PyArray_HANDLER(data_array));
+ if (new_data == NULL) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ /* Replace the arrays data since it may have changed */
+ ((PyArrayObject_fields *)data_array)->data = new_data;
+ ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows;
+ data_ptr = new_data + row_count * row_size;
+ data_allocated_rows = new_rows;
+ if (needs_init) {
+ memset(data_ptr, '\0', (new_rows - row_count) * row_size);
+ }
+ }
+
+ for (Py_ssize_t i = 0; i < actual_num_fields; ++i) {
+ Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. */
+ Py_ssize_t col; /* The column as read, remapped by usecols */
+ char *item_ptr;
+ if (homogeneous) {
+ f = 0;
+ item_ptr = data_ptr + i * field_types[0].descr->elsize;
+ }
+ else {
+ f = i;
+ item_ptr = data_ptr + field_types[f].structured_offset;
+ }
+
+ if (usecols == NULL) {
+ col = i;
+ }
+ else {
+ col = usecols[i];
+ if (col < 0) {
+ // Python-like column indexing: k = -1 means the last column.
+ col += current_num_fields;
+ }
+ if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) {
+ PyErr_Format(PyExc_ValueError,
+ "invalid column index %d at row %zu with %d "
+ "columns",
+ usecols[i], current_num_fields, row_count+1);
+ goto error;
+ }
+ }
+
+ /*
+ * The following function calls represent the main "conversion"
+ * step, i.e. parsing the unicode string for each field and storing
+ * the result in the array.
+ */
+ int parser_res;
+ Py_UCS4 *str = ts.field_buffer + fields[col].offset;
+ Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1;
+ if (conv_funcs == NULL || conv_funcs[i] == NULL) {
+ parser_res = field_types[f].set_from_ucs4(field_types[f].descr,
+ str, end, item_ptr, pconfig);
+ }
+ else {
+ parser_res = to_generic_with_converter(field_types[f].descr,
+ str, end, item_ptr, pconfig, conv_funcs[i]);
+ }
+
+ if (NPY_UNLIKELY(parser_res < 0)) {
+ PyObject *exc, *val, *tb;
+ PyErr_Fetch(&exc, &val, &tb);
+
+ size_t length = end - str;
+ PyObject *string = PyUnicode_FromKindAndData(
+ PyUnicode_4BYTE_KIND, str, length);
+ if (string == NULL) {
+ npy_PyErr_ChainExceptions(exc, val, tb);
+ goto error;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "could not convert string %.100R to %S at "
+ "row %zu, column %zd.",
+ string, field_types[f].descr, row_count, col+1);
+ Py_DECREF(string);
+ npy_PyErr_ChainExceptionsCause(exc, val, tb);
+ goto error;
+ }
+ }
+
+ ++row_count;
+ data_ptr += row_size;
+ }
+
+ tokenizer_clear(&ts);
+ PyMem_FREE(conv_funcs);
+
+ if (data_array == NULL) {
+ assert(row_count == 0 && result_shape[0] == 0);
+ if (actual_num_fields == -1) {
+ /*
+ * We found no rows and still have to discover the number of elements;
+ * we have no choice but to guess 1.
+ * NOTE: It may make sense to move this out of here to refine the
+ * behaviour where necessary.
+ */
+ result_shape[1] = 1;
+ }
+ else {
+ result_shape[1] = actual_num_fields;
+ }
+ Py_INCREF(out_descr);
+ data_array = (PyArrayObject *)PyArray_Empty(
+ ndim, result_shape, out_descr, 0);
+ }
+
+ /*
+ * Note that if there is no data, `data_array` may still be NULL and
+ * row_count is 0. In that case, always realloc just in case.
+ */
+ if (data_array_allocated && data_allocated_rows != row_count) {
+ size_t size = row_count * row_size;
+ char *new_data = PyDataMem_UserRENEW(
+ PyArray_BYTES(data_array), size ? size : 1,
+ PyArray_HANDLER(data_array));
+ if (new_data == NULL) {
+ Py_DECREF(data_array);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ ((PyArrayObject_fields *)data_array)->data = new_data;
+ ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count;
+ }
+
+ return data_array;
+
+ error:
+ PyMem_FREE(conv_funcs);
+ tokenizer_clear(&ts);
+ Py_XDECREF(data_array);
+ return NULL;
+}
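A rough Python sketch of the growth rule used above when the row buffer fills up: grow by about 25% and round up to the next multiple of `rows_per_block`, as described next to the `grow_size_and_multiply` call. The helper name `grow_rows` is made up for illustration, and the real helper additionally guards the resulting byte count against overflow:

    def grow_rows(current_rows, rows_per_block):
        # ~25% larger, then rounded up to a whole number of blocks
        grown = current_rows + (current_rows >> 2)
        blocks = -(-grown // rows_per_block)      # ceiling division
        return max(blocks, 1) * rows_per_block

    sizes = [512]
    for _ in range(5):
        sizes.append(grow_rows(sizes[-1], 512))
    print(sizes)   # [512, 1024, 1536, 2048, 2560, 3584]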
diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h
new file mode 100644
index 000000000..20eb9e186
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/rows.h
@@ -0,0 +1,22 @@
+
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdio.h>
+
+#include "textreading/stream.h"
+#include "textreading/field_types.h"
+#include "textreading/parser_config.h"
+
+
+NPY_NO_EXPORT PyArrayObject *
+read_rows(stream *s,
+ npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types,
+ parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
+ Py_ssize_t skiplines, PyObject *converters,
+ PyArrayObject *data_array, PyArray_Descr *out_descr,
+ bool homogeneous);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ */
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c
new file mode 100644
index 000000000..11b03e31c
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/str_to_int.c
@@ -0,0 +1,67 @@
+
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "lowlevel_strided_loops.h"
+
+#include <string.h>
+#include "textreading/str_to_int.h"
+#include "textreading/parser_config.h"
+
+
+#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX, byteswap_unaligned) \
+ NPY_NO_EXPORT int \
+ to_##intw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig) \
+ { \
+ int64_t parsed; \
+ intw##_t x; \
+ \
+ if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \
+ return -1; \
+ } \
+ else { \
+ x = (intw##_t)parsed; \
+ } \
+ memcpy(dataptr, &x, sizeof(x)); \
+ if (!PyArray_ISNBO(descr->byteorder)) { \
+ byteswap_unaligned(dataptr); \
+ } \
+ return 0; \
+ }
+
+#define DECLARE_TO_UINT(uintw, UINT_MAX, byteswap_unaligned) \
+ NPY_NO_EXPORT int \
+ to_##uintw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig) \
+ { \
+ uint64_t parsed; \
+ uintw##_t x; \
+ \
+ if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \
+ return -1; \
+ } \
+ else { \
+ x = (uintw##_t)parsed; \
+ } \
+ memcpy(dataptr, &x, sizeof(x)); \
+ if (!PyArray_ISNBO(descr->byteorder)) { \
+ byteswap_unaligned(dataptr); \
+ } \
+ return 0; \
+ }
+
+#define byteswap_nothing(ptr)
+
+DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX, byteswap_nothing)
+DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX, npy_bswap2_unaligned)
+DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX, npy_bswap4_unaligned)
+DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX, npy_bswap8_unaligned)
+
+DECLARE_TO_UINT(uint8, UINT8_MAX, byteswap_nothing)
+DECLARE_TO_UINT(uint16, UINT16_MAX, npy_bswap2_unaligned)
+DECLARE_TO_UINT(uint32, UINT32_MAX, npy_bswap4_unaligned)
+DECLARE_TO_UINT(uint64, UINT64_MAX, npy_bswap8_unaligned)
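From the user side, the visible effect of the byteswap step in these converters is that non-native dtypes can be filled directly. An illustrative, behaviour-only example through `np.loadtxt`, the Python entry point that this PR routes through the new reader:

    import numpy as np

    # Values are parsed natively and byteswapped into place when the target
    # descriptor is not in native byte order.
    arr = np.loadtxt(["1 2", "3 4"], dtype=">i4")
    print(np.array_equal(arr, [[1, 2], [3, 4]]))   # True
    print(arr.dtype.str)                           # '>i4' on little-endian machines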
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h
new file mode 100644
index 000000000..a0a89a0ef
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/str_to_int.h
@@ -0,0 +1,174 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/parser_config.h"
+
+
+/*
+ * The following two string conversion functions are largely equivalent to
+ * the ones in pandas. They live in this header so that they can easily be
+ * inlined into the functions that use them.
+ * Unlike the pandas versions, they take an end pointer (they do not rely on
+ * a trailing \0) and return 0 or -1.
+ *
+ * The fixed-width integer converters are declared via macro templating
+ * below and defined in str_to_int.c.
+ */
+NPY_FINLINE int
+str_to_int64(
+ const Py_UCS4 *p_item, const Py_UCS4 *p_end,
+ int64_t int_min, int64_t int_max, int64_t *result)
+{
+ const Py_UCS4 *p = (const Py_UCS4 *)p_item;
+ bool isneg = 0;
+ int64_t number = 0;
+
+ // Skip leading spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ isneg = true;
+ ++p;
+ }
+ else if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit(*p)) {
+ return -1;
+ }
+
+ if (isneg) {
+ // If number is greater than pre_min, at least one more digit
+ // can be processed without overflowing.
+ int dig_pre_min = -(int_min % 10);
+ int64_t pre_min = int_min / 10;
+
+ // Process the digits.
+ int d = *p;
+ while (isdigit(d)) {
+ if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+ }
+ else {
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ int64_t pre_max = int_max / 10;
+ int dig_pre_max = int_max % 10;
+
+ // Process the digits.
+ int d = *p;
+ while (isdigit(d)) {
+ if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+ }
+
+ // Skip trailing spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (p != p_end) {
+ return -1;
+ }
+
+ *result = number;
+ return 0;
+}
+
+
+NPY_FINLINE int
+str_to_uint64(
+ const Py_UCS4 *p_item, const Py_UCS4 *p_end,
+ uint64_t uint_max, uint64_t *result)
+{
+ const Py_UCS4 *p = (const Py_UCS4 *)p_item;
+ uint64_t number = 0;
+ int d;
+
+ // Skip leading spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ return -1;
+ }
+ if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit(*p)) {
+ return -1;
+ }
+
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ uint64_t pre_max = uint_max / 10;
+ int dig_pre_max = uint_max % 10;
+
+ // Process the digits.
+ d = *p;
+ while (isdigit(d)) {
+ if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+ }
+ else {
+ return -1;
+ }
+ }
+
+ // Skip trailing spaces.
+ while (Py_UNICODE_ISSPACE(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (p != p_end) {
+ return -1;
+ }
+
+ *result = number;
+ return 0;
+}
+
+
+#define DECLARE_TO_INT_PROTOTYPE(intw) \
+ NPY_NO_EXPORT int \
+ to_##intw(PyArray_Descr *descr, \
+ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \
+ parser_config *pconfig);
+
+DECLARE_TO_INT_PROTOTYPE(int8)
+DECLARE_TO_INT_PROTOTYPE(int16)
+DECLARE_TO_INT_PROTOTYPE(int32)
+DECLARE_TO_INT_PROTOTYPE(int64)
+
+DECLARE_TO_INT_PROTOTYPE(uint8)
+DECLARE_TO_INT_PROTOTYPE(uint16)
+DECLARE_TO_INT_PROTOTYPE(uint32)
+DECLARE_TO_INT_PROTOTYPE(uint64)
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ */
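A Python sketch of the overflow guard used by `str_to_int64`/`str_to_uint64` above: before each digit is folded in, the running value is compared against `limit // 10` (and against the last digit of the limit), so the multiply-and-add can never exceed the requested range. Python integers do not overflow, so this only illustrates the control flow; the function name is made up:

    def parse_int(text, int_min=-2**63, int_max=2**63 - 1):
        s = text.strip()                      # leading/trailing whitespace is allowed
        neg = s.startswith('-')
        if s[:1] in '+-':
            s = s[1:]
        if not s or not all('0' <= c <= '9' for c in s):
            raise ValueError(f"not an integer: {text!r}")
        limit = -int_min if neg else int_max  # magnitude limit
        pre_max, dig_pre_max = divmod(limit, 10)
        number = 0
        for c in s:
            d = ord(c) - ord('0')
            if number > pre_max or (number == pre_max and d > dig_pre_max):
                raise OverflowError(f"{text!r} does not fit the target type")
            number = number * 10 + d
        return -number if neg else number

    print(parse_int(" 42 "))             # 42
    print(parse_int("-128", -128, 127))  # -128 (fits int8)
    parse_int("128", -128, 127)          # raises OverflowError, like to_int8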
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
new file mode 100644
index 000000000..59bd14074
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -0,0 +1,41 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_
+
+#include <stdint.h>
+
+/*
+ * When getting the next line, we hope that the buffer provider can already
+ * give some information about the newlines, because for Python iterables
+ * we definitely expect to get line-by-line buffers.
+ *
+ * BUFFER_IS_FILEEND must be returned when the end of the file is reached and
+ * must NOT be returned together with a valid (non-empty) buffer.
+ */
+#define BUFFER_MAY_CONTAIN_NEWLINE 0
+#define BUFFER_IS_LINEND 1
+#define BUFFER_IS_FILEEND 2
+
+/*
+ * Base struct for streams. We currently have two: a chunked reader for
+ * file-like objects and a line-by-line reader for any iterable.
+ * As of writing, the chunked reader is only used for file-likes that were
+ * not already open, so that the exact amount read is preserved in case of
+ * an error.
+ * If we dropped this, we could use chunked reads more often (but not when
+ * `max_rows` is used).
+ *
+ * The "streams" can extend this struct to store their own data (so it is
+ * a very lightweight "object").
+ */
+typedef struct _stream {
+ int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind);
+ // Note that the first argument to stream_close is the stream pointer
+ // itself, not the stream_data pointer.
+ int (*stream_close)(struct _stream *strm);
+} stream;
+
+
+#define stream_nextbuf(s, start, end, kind) \
+ ((s)->stream_nextbuf((s), start, end, kind))
+#define stream_close(s) ((s)->stream_close((s)))
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */
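A loose Python analogue of this two-function interface, for illustration only; the class and names are made up, and the real implementations follow in stream_pyobject.c:

    BUFFER_MAY_CONTAIN_NEWLINE, BUFFER_IS_LINEND, BUFFER_IS_FILEEND = 0, 1, 2

    class LineStream:
        """One buffer per line, loosely mimicking the iterable-based stream."""
        def __init__(self, lines):
            self._it = iter(lines)

        def nextbuf(self):
            line = next(self._it, None)
            if line is None:
                # File end is never reported together with a non-empty buffer.
                return "", BUFFER_IS_FILEEND
            return line, BUFFER_IS_LINEND

        def close(self):
            pass

    s = LineStream(["1,2", "3,4"])
    while True:
        buf, state = s.nextbuf()
        if state == BUFFER_IS_FILEEND:
            break
        print(repr(buf))                  # '1,2', then '3,4'
    s.close()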
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c
new file mode 100644
index 000000000..6f84ff01d
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c
@@ -0,0 +1,239 @@
+/*
+ * C side structures providing the capability to read Python file-like
+ * objects in chunks, or to iterate through iterables with each result
+ * representing a single line of a file.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#include "textreading/stream.h"
+
+#define READ_CHUNKSIZE (1 << 14)
+
+
+typedef struct {
+ stream stream;
+ /* The Python file object being read. */
+ PyObject *file;
+
+ /* The `read` attribute of the file object. */
+ PyObject *read;
+ /* Amount to read each time we call `obj.read()` */
+ PyObject *chunksize;
+
+ /* Python str object holding the chunk most recently read from the file. */
+ PyObject *chunk;
+
+ /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */
+ const char *encoding;
+} python_chunks_from_file;
+
+
+/*
+ * Helper function to support byte objects as well as unicode strings.
+ *
+ * NOTE: Steals a reference to `str` (although usually returns it unmodified).
+ */
+static NPY_INLINE PyObject *
+process_stringlike(PyObject *str, const char *encoding)
+{
+ if (PyBytes_Check(str)) {
+ PyObject *ustr;
+ ustr = PyUnicode_FromEncodedObject(str, encoding, NULL);
+ if (ustr == NULL) {
+ return NULL;
+ }
+ Py_DECREF(str);
+ return ustr;
+ }
+ else if (!PyUnicode_Check(str)) {
+ PyErr_SetString(PyExc_TypeError,
+ "non-string returned while reading data");
+ Py_DECREF(str);
+ return NULL;
+ }
+ return str;
+}
+
+
+static NPY_INLINE void
+buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind)
+{
+ Py_ssize_t length = PyUnicode_GET_LENGTH(str);
+ *kind = PyUnicode_KIND(str);
+
+ if (*kind == PyUnicode_1BYTE_KIND) {
+ *start = (char *)PyUnicode_1BYTE_DATA(str);
+ }
+ else if (*kind == PyUnicode_2BYTE_KIND) {
+ *start = (char *)PyUnicode_2BYTE_DATA(str);
+ length *= sizeof(Py_UCS2);
+ }
+ else if (*kind == PyUnicode_4BYTE_KIND) {
+ *start = (char *)PyUnicode_4BYTE_DATA(str);
+ length *= sizeof(Py_UCS4);
+ }
+ *end = *start + length;
+}
+
+
+static int
+fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind)
+{
+ Py_XDECREF(fb->chunk);
+ fb->chunk = NULL;
+
+ PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL);
+ if (chunk == NULL) {
+ return -1;
+ }
+ fb->chunk = process_stringlike(chunk, fb->encoding);
+ if (fb->chunk == NULL) {
+ return -1;
+ }
+ buffer_info_from_unicode(fb->chunk, start, end, kind);
+ if (*start == *end) {
+ return BUFFER_IS_FILEEND;
+ }
+ return BUFFER_MAY_CONTAIN_NEWLINE;
+}
+
+
+static int
+fb_del(stream *strm)
+{
+ python_chunks_from_file *fb = (python_chunks_from_file *)strm;
+
+ Py_XDECREF(fb->file);
+ Py_XDECREF(fb->read);
+ Py_XDECREF(fb->chunksize);
+ Py_XDECREF(fb->chunk);
+
+ PyMem_FREE(strm);
+
+ return 0;
+}
+
+
+NPY_NO_EXPORT stream *
+stream_python_file(PyObject *obj, const char *encoding)
+{
+ python_chunks_from_file *fb;
+
+ fb = (python_chunks_from_file *)PyMem_Calloc(1, sizeof(python_chunks_from_file));
+ if (fb == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ fb->stream.stream_nextbuf = (void *)&fb_nextbuf;
+ fb->stream.stream_close = &fb_del;
+
+ fb->encoding = encoding;
+ Py_INCREF(obj);
+ fb->file = obj;
+
+ fb->read = PyObject_GetAttrString(obj, "read");
+ if (fb->read == NULL) {
+ goto fail;
+ }
+ fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE);
+ if (fb->chunksize == NULL) {
+ goto fail;
+ }
+
+ return (stream *)fb;
+
+fail:
+ fb_del((stream *)fb);
+ return NULL;
+}
+
+
+/*
+ * Stream from a Python iterable by interpreting each item as a line in a file
+ */
+typedef struct {
+ stream stream;
+ /* The Python iterator being read from. */
+ PyObject *iterator;
+
+ /* Python str object holding the line most recently fetched */
+ PyObject *line;
+
+ /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */
+ const char *encoding;
+} python_lines_from_iterator;
+
+
+static int
+it_del(stream *strm)
+{
+ python_lines_from_iterator *it = (python_lines_from_iterator *)strm;
+
+ Py_XDECREF(it->iterator);
+ Py_XDECREF(it->line);
+
+ PyMem_FREE(strm);
+ return 0;
+}
+
+
+static int
+it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind)
+{
+ Py_XDECREF(it->line);
+ it->line = NULL;
+
+ PyObject *line = PyIter_Next(it->iterator);
+ if (line == NULL) {
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ *start = NULL;
+ *end = NULL;
+ return BUFFER_IS_FILEEND;
+ }
+ it->line = process_stringlike(line, it->encoding);
+ if (it->line == NULL) {
+ return -1;
+ }
+
+ buffer_info_from_unicode(it->line, start, end, kind);
+ return BUFFER_IS_LINEND;
+}
+
+
+NPY_NO_EXPORT stream *
+stream_python_iterable(PyObject *obj, const char *encoding)
+{
+ python_lines_from_iterator *it;
+
+ if (!PyIter_Check(obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "error reading from object, expected an iterable.");
+ return NULL;
+ }
+
+ it = (python_lines_from_iterator *)PyMem_Calloc(1, sizeof(*it));
+ if (it == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ it->stream.stream_nextbuf = (void *)&it_nextbuf;
+ it->stream.stream_close = &it_del;
+
+ it->encoding = encoding;
+ Py_INCREF(obj);
+ it->iterator = obj;
+
+ return (stream *)it;
+}
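These two wrappers cover the two kinds of input the reader accepts from Python. Which wrapper a given input ends up in is decided by the Python-level code in npyio.py, so the following only illustrates the two shapes of input, not the dispatch itself:

    import io
    import numpy as np

    # An object with a ``read`` method (chunked reading).
    print(np.loadtxt(io.StringIO("1 2\n3 4\n")))
    # Any iterator of strings, one line per item (line-by-line reading).
    print(np.loadtxt(iter(["1 2", "3 4"])))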
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h
new file mode 100644
index 000000000..45c11dd95
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h
@@ -0,0 +1,16 @@
+
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "textreading/stream.h"
+
+NPY_NO_EXPORT stream *
+stream_python_file(PyObject *obj, const char *encoding);
+
+NPY_NO_EXPORT stream *
+stream_python_iterable(PyObject *obj, const char *encoding);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ */
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
new file mode 100644
index 000000000..6ddba3345
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -0,0 +1,457 @@
+
+#include <Python.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/parser_config.h"
+#include "textreading/growth.h"
+
+
+/*
+ How parsing quoted fields works:
+
+ For quoting to be activated, the first character of the field
+ must be the quote character (after taking into account
+ ignore_leading_spaces). While quoting is active, delimiters
+ are treated as regular characters, not delimiters. Quoting is
+ deactivated by the second occurrence of the quote character. An
+ exception is the occurrence of two consecutive quote characters,
+ which is treated as a literal occurrence of a single quote character.
+ E.g. (with delimiter=',' and quote='"'):
+ 12.3,"New York, NY","3'2"""
+ The second and third fields are `New York, NY` and `3'2"`.
+
+ If a non-delimiter occurs after the closing quote, the quote is
+ ignored and parsing continues with quoting deactivated. Quotes
+ that occur while quoting is not activated are not handled specially;
+ they become part of the data.
+ E.g.:
+ 12.3,"ABC"DEF,XY"Z
+ The second and third fields are `ABCDEF` and `XY"Z`.
+
+ Note that the second field of
+ 12.3,"ABC" ,4.5
+ is `ABC `. Currently there is no option to ignore whitespace
+ at the end of a field.
+*/
+
+
+/**begin repeat
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+copy_to_field_buffer_@type@(tokenizer_state *ts,
+ const @type@ *chunk_start, const @type@ *chunk_end)
+{
+ npy_intp chunk_length = chunk_end - chunk_start;
+ npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+
+ if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
+ npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
+ if (alloc_size < 0) {
+ PyErr_Format(PyExc_ValueError,
+ "line too long to handle while reading file.");
+ return -1;
+ }
+ Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
+ if (grown == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->field_buffer_length = size;
+ ts->field_buffer = grown;
+ }
+
+ Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos;
+ for (; chunk_start < chunk_end; chunk_start++, write_pos++) {
+ *write_pos = (Py_UCS4)*chunk_start;
+ }
+ *write_pos = '\0'; /* always ensure we end with NUL */
+ ts->field_buffer_pos += chunk_length;
+ return 0;
+}
+/**end repeat**/
+
+
+static NPY_INLINE int
+add_field(tokenizer_state *ts)
+{
+ /* The previous field is done, advance to keep a NUL byte at the end */
+ ts->field_buffer_pos += 1;
+
+ if (NPY_UNLIKELY(ts->num_fields + 1 > ts->fields_size)) {
+ npy_intp size = ts->num_fields;
+
+ npy_intp alloc_size = grow_size_and_multiply(
+ &size, 4, sizeof(field_info));
+ if (alloc_size < 0) {
+ /* Check for a size overflow; this path should be almost impossible to hit. */
+ PyErr_Format(PyExc_ValueError,
+ "too many columns found; cannot read file.");
+ return -1;
+ }
+ field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
+ if (fields == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->fields = fields;
+ ts->fields_size = size;
+ }
+
+ ts->fields[ts->num_fields].offset = ts->field_buffer_pos;
+ ts->fields[ts->num_fields].quoted = false;
+ ts->num_fields += 1;
+ /* Ensure this (currently empty) word is NUL terminated. */
+ ts->field_buffer[ts->field_buffer_pos] = '\0';
+ return 0;
+}
+
+
+/**begin repeat
+ * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+{
+ @type@ *pos = (@type@ *)ts->pos;
+ @type@ *stop = (@type@ *)ts->end;
+ @type@ *chunk_start;
+
+ if (ts->state == TOKENIZE_CHECK_QUOTED) {
+ /* before we can check for quotes, strip leading whitespace */
+ if (config->ignore_leading_whitespace) {
+ while (pos < stop && Py_UNICODE_ISSPACE(*pos) &&
+ *pos != '\r' && *pos != '\n') {
+ pos++;
+ }
+ if (pos == stop) {
+ ts->pos = (char *)pos;
+ return 0;
+ }
+ }
+
+ /* Setting chunk effectively starts the field */
+ if (*pos == config->quote) {
+ ts->fields[ts->num_fields - 1].quoted = true;
+ ts->state = TOKENIZE_QUOTED;
+ pos++; /* TOKENIZE_QUOTED is OK with pos == stop */
+ }
+ else {
+ /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+ ts->state = ts->unquoted_state;
+ }
+ }
+
+ switch (ts->state) {
+ case TOKENIZE_UNQUOTED:
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ else if (*pos == config->delimiter) {
+ ts->state = TOKENIZE_INIT;
+ break;
+ }
+ else if (*pos == config->comment) {
+ ts->state = TOKENIZE_GOTO_LINE_END;
+ break;
+ }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_UNQUOTED_WHITESPACE:
+ /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ else if (Py_UNICODE_ISSPACE(*pos)) {
+ ts->state = TOKENIZE_INIT;
+ break;
+ }
+ else if (*pos == config->comment) {
+ ts->state = TOKENIZE_GOTO_LINE_END;
+ break;
+ }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_QUOTED:
+ chunk_start = pos;
+ for (; pos < stop; pos++) {
+ if (*pos == config->quote) {
+ ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE;
+ break;
+ }
+ }
+ if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ return -1;
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
+ if (*pos == config->quote) {
+ /* Copy the quote character directly from the config: */
+ if (copy_to_field_buffer_Py_UCS4(ts,
+ &config->quote, &config->quote+1) < 0) {
+ return -1;
+ }
+ ts->state = TOKENIZE_QUOTED;
+ pos++;
+ }
+ else {
+ /* continue parsing as if unquoted */
+ ts->state = TOKENIZE_UNQUOTED;
+ }
+ break;
+
+ case TOKENIZE_GOTO_LINE_END:
+ if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
+ pos = stop; /* advance to next buffer */
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ for (; pos < stop; pos++) {
+ if (*pos == '\r') {
+ ts->state = TOKENIZE_EAT_CRLF;
+ break;
+ }
+ else if (*pos == '\n') {
+ ts->state = TOKENIZE_LINE_END;
+ break;
+ }
+ }
+ pos++;
+ break;
+
+ case TOKENIZE_EAT_CRLF:
+ /* "Universal newline" support: remove \n in \r\n. */
+ if (*pos == '\n') {
+ pos++;
+ }
+ ts->state = TOKENIZE_LINE_END;
+ break;
+
+ default:
+ assert(0);
+ }
+
+ ts->pos = (char *)pos;
+ return 0;
+}
+/**end repeat**/
+
+
+/*
+ * This tokenizer always copies the full "row" (all tokens). This makes
+ * two things easier:
+ * 1. It means that every word is guaranteed to be followed by a NUL character
+ * (although it can include one as well).
+ * 2. If usecols are used we can sniff the first row more easily by parsing it
+ * fully. Further, usecols can be negative so we may not know which columns
+ * we need up-front.
+ *
+ * The tokenizer could grow the ability to skip fields and check the
+ * maximum number of fields when it is known, but it is unclear that this is
+ * worthwhile.
+ *
+ * Unlike some tokenizers, this one tries to work in chunks and copies
+ * data in chunks as well. The hope is that this makes multiple light-weight
+ * loops rather than a single heavy one, to allow e.g. quickly scanning for the
+ * end of a field. Copying chunks also means we usually only check once per
+ * field whether the buffer is large enough.
+ * Different choices are possible, this one seems to work well, though.
+ *
+ * The core (main part) of the tokenizer is specialized for the three Python
+ * unicode flavors UCS1, UCS2, and UCS4 as a worthwhile optimization.
+ */
+NPY_NO_EXPORT int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
+{
+ assert(ts->fields_size >= 2);
+ assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4));
+
+ int finished_reading_file = 0;
+
+ /* Reset to start of buffer */
+ ts->field_buffer_pos = 0;
+ ts->num_fields = 0;
+
+ while (1) {
+ /*
+ * This loop adds new fields to the result (to make up a full row)
+ * until the row ends (typically a line end or the file end)
+ */
+ if (ts->state == TOKENIZE_INIT) {
+ /* Start a new field */
+ if (add_field(ts) < 0) {
+ return -1;
+ }
+ ts->state = TOKENIZE_CHECK_QUOTED;
+ }
+
+ if (NPY_UNLIKELY(ts->pos >= ts->end)) {
+ if (ts->buf_state == BUFFER_IS_LINEND &&
+ ts->state != TOKENIZE_QUOTED) {
+ /*
+ * Finished line, do not read anymore (also do not eat \n).
+ * If we are in a quoted field and the "line" does not end with
+ * a newline, the quoted field will not have it either.
+ * I.e. `np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"')`
+ * reads "ab". This matches `next(csv.reader(['"a', 'b"']))`.
+ */
+ break;
+ }
+ /* fetch new data */
+ ts->buf_state = stream_nextbuf(s,
+ &ts->pos, &ts->end, &ts->unicode_kind);
+ if (ts->buf_state < 0) {
+ return -1;
+ }
+ if (ts->buf_state == BUFFER_IS_FILEEND) {
+ finished_reading_file = 1;
+ ts->pos = ts->end; /* stream should ensure this. */
+ break;
+ }
+ else if (ts->pos == ts->end) {
+ /* This must be an empty line (and it must be indicated!). */
+ assert(ts->buf_state == BUFFER_IS_LINEND);
+ break;
+ }
+ }
+ int status;
+ if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
+ status = tokenizer_core_Py_UCS1(ts, config);
+ }
+ else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
+ status = tokenizer_core_Py_UCS2(ts, config);
+ }
+ else {
+ assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
+ status = tokenizer_core_Py_UCS4(ts, config);
+ }
+ if (status < 0) {
+ return -1;
+ }
+
+ if (ts->state == TOKENIZE_LINE_END) {
+ break;
+ }
+ }
+
+ /*
+ * We have finished tokenizing a full row into fields, finalize result
+ */
+ if (ts->buf_state == BUFFER_IS_LINEND) {
+ /* This line is "finished", make sure we don't touch it again: */
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
+ if (NPY_UNLIKELY(ts->pos < ts->end)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Found an unquoted embedded newline within a single line of "
+ "input. This is currently not supported.");
+ return -1;
+ }
+ }
+
+ /* Finish the last field (we "append" one to store the last one's length) */
+ if (add_field(ts) < 0) {
+ return -1;
+ }
+ ts->num_fields -= 1;
+
+ /*
+ * If we have one field, but that field is completely empty, this is an
+ * empty line, and we just ignore it.
+ */
+ if (ts->num_fields == 1
+ && ts->fields[1].offset - ts->fields[0].offset == 1
+ && !ts->fields->quoted) {
+ ts->num_fields--;
+ }
+ ts->state = TOKENIZE_INIT;
+ return finished_reading_file;
+}
+
+
+NPY_NO_EXPORT void
+tokenizer_clear(tokenizer_state *ts)
+{
+ PyMem_FREE(ts->field_buffer);
+ ts->field_buffer = NULL;
+ ts->field_buffer_length = 0;
+
+ PyMem_FREE(ts->fields);
+ ts->fields = NULL;
+ ts->fields_size = 0;
+}
+
+
+/*
+ * Initialize the tokenizer. We may want to copy all important config
+ * variables into the tokenizer. This would improve the cache locality during
+ * tokenizing.
+ */
+NPY_NO_EXPORT int
+tokenizer_init(tokenizer_state *ts, parser_config *config)
+{
+ /* State and buf_state could be moved into tokenize if we go by row */
+ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
+ ts->state = TOKENIZE_INIT;
+ if (config->delimiter_is_whitespace) {
+ ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE;
+ }
+ else {
+ ts->unquoted_state = TOKENIZE_UNQUOTED;
+ }
+ ts->num_fields = 0;
+
+ ts->buf_state = 0;
+ ts->pos = NULL;
+ ts->end = NULL;
+
+ ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
+ if (ts->field_buffer == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->field_buffer_length = 32;
+
+ ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
+ if (ts->fields == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ ts->fields_size = 4;
+ return 0;
+}
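The quoting rules documented at the top of this file are intended to match Python's csv module, so csv.reader can be used as a quick cross-check of the examples given there:

    import csv

    # The example row from the comment block above:
    print(next(csv.reader(['12.3,"New York, NY","3\'2"""'])))
    # -> ['12.3', 'New York, NY', '3\'2"']

    # An unterminated quote swallows the line break, matching the
    # np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"') example above:
    print(next(csv.reader(['"a', 'b"'])))
    # -> ['ab']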
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
new file mode 100644
index 000000000..fa10bb9b0
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -0,0 +1,78 @@
+
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_
+
+#include <Python.h>
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/stream.h"
+#include "textreading/parser_config.h"
+
+
+typedef enum {
+ /* Initialization of fields */
+ TOKENIZE_INIT,
+ TOKENIZE_CHECK_QUOTED,
+ /* Main field parsing states */
+ TOKENIZE_UNQUOTED,
+ TOKENIZE_UNQUOTED_WHITESPACE,
+ TOKENIZE_QUOTED,
+ /* Handling of two character control sequences (except "\r\n") */
+ TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE,
+ /* Line end handling */
+ TOKENIZE_LINE_END,
+ TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */
+ TOKENIZE_GOTO_LINE_END,
+} tokenizer_parsing_state;
+
+
+typedef struct {
+ size_t offset;
+ bool quoted;
+} field_info;
+
+
+typedef struct {
+ tokenizer_parsing_state state;
+ /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */
+ tokenizer_parsing_state unquoted_state;
+ int unicode_kind;
+ int buf_state;
+ /* the buffer we are currently working on */
+ char *pos;
+ char *end;
+ /*
+ * Space to copy words into. The buffer must always be at least two NUL
+ * entries longer (8 bytes) than the actual word (including initially).
+ * The first byte beyond the current word is always NUL'ed on write, the
+ * second byte is there to allow easy appending of an additional empty
+ * word at the end (this word is also NUL terminated).
+ */
+ npy_intp field_buffer_length;
+ npy_intp field_buffer_pos;
+ Py_UCS4 *field_buffer;
+
+ /*
+ * Fields, including information about the field being quoted. This
+ * always includes one "additional" empty field. The length of a field
+ * is equal to `fields[i+1].offset - fields[i].offset - 1`.
+ *
+ * The tokenizer assumes at least one field is allocated.
+ */
+ npy_intp num_fields;
+ npy_intp fields_size;
+ field_info *fields;
+} tokenizer_state;
+
+
+NPY_NO_EXPORT void
+tokenizer_clear(tokenizer_state *ts);
+
+
+NPY_NO_EXPORT int
+tokenizer_init(tokenizer_state *ts, parser_config *config);
+
+NPY_NO_EXPORT int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */
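A tiny Python sketch of the offset convention described in the struct comment above; the buffer contents and offsets here are made up for illustration, and each word is NUL terminated, hence the `- 1`:

    field_buffer = list("12.3\0abc\0\0")   # two words plus the extra empty one
    offsets = [0, 5, 9]                    # fields[i].offset for i = 0, 1, 2
    for i in range(len(offsets) - 1):
        length = offsets[i + 1] - offsets[i] - 1
        word = "".join(field_buffer[offsets[i]:offsets[i] + length])
        print(repr(word))                  # '12.3', then 'abc'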
diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src
index 933f75808..b4b060720 100644
--- a/numpy/core/src/npysort/quicksort.c.src
+++ b/numpy/core/src/npysort/quicksort.c.src
@@ -51,8 +51,14 @@
#include "npy_sort.h"
#include "npysort_common.h"
+#include "npy_cpu_features.h"
+#include "x86-qsort.h"
#include <stdlib.h>
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "x86-qsort.dispatch.h"
+#endif
+
#define NOT_USED NPY_UNUSED(unused)
/*
* pushing largest partition has upper bound of log2(n) space
@@ -83,11 +89,22 @@
* npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong,
* npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat,
* npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
+ * #AVX512 = 0*5, 1, 1, 0*5, 1, 0*7#
*/
NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
+
+#if @AVX512@
+ void (*dispfunc)(void*, npy_intp) = NULL;
+ NPY_CPU_DISPATCH_CALL_XB(dispfunc = &x86_quicksort_@suff@);
+ if (dispfunc) {
+ (*dispfunc)(start, num);
+ return 0;
+ }
+#endif
+
@type@ vp;
@type@ *pl = start;
@type@ *pr = pl + num - 1;
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.c.src b/numpy/core/src/npysort/x86-qsort.dispatch.c.src
new file mode 100644
index 000000000..b93c737cb
--- /dev/null
+++ b/numpy/core/src/npysort/x86-qsort.dispatch.c.src
@@ -0,0 +1,587 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_skx
+ */
+// The $keep_baseline policy is used to avoid skipping the avx512_skx build
+// when it is part of the baseline features (--cpu-baseline), since the
+// 'baseline' option isn't specified within the targets.
+
+#include "x86-qsort.h"
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#ifdef NPY_HAVE_AVX512_SKX
+#include <immintrin.h>
+#include "numpy/npy_math.h"
+#include "npy_sort.h"
+#include "simd/simd.h"
+
+
+/*
+ * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are
+ * based on these two research papers:
+ * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types
+ * https://drops.dagstuhl.de/opus/volltexte/2021/13775/
+ * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel Skylake
+ * https://arxiv.org/pdf/1704.08579.pdf
+ *
+ * High level idea: Vectorize the quicksort partitioning using AVX-512
+ * compressstore instructions. The pivot is picked as the median of 72
+ * elements chosen at random. If the array size is < 128, a bitonic
+ * sorting network is used instead. Good resource for bitonic sorting networks:
+ * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
+ *
+ * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 for
+ * potential problems when converting this code to universal intrinsics framework.
+ */
+
+/*
+ * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
+ * sorting network (see
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
+ */
+#define NETWORK1 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+#define NETWORK2 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
+#define NETWORK3 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
+#define NETWORK4 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2
+#define NETWORK5 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+#define NETWORK6 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4
+#define NETWORK7 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8
+#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF)
+#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32)
+#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32)
+#define SHUFFLE_MASK(a,b,c,d) (a << 6) | (b << 4) | (c << 2) | d
+#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK)
+#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK)
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+/*
+ * Vectorized random number generator xoroshiro128+. Broken into 2 parts:
+ * (1) vnext generates 2 64-bit random integers
+ * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to
+ * the length of the array
+ */
+#define VROTL(x, k) /* rotate each uint64_t value in vector */ \
+ _mm256_or_si256(_mm256_slli_epi64((x),(k)),_mm256_srli_epi64((x),64-(k)))
+
+static NPY_INLINE
+__m256i vnext(__m256i* s0, __m256i* s1) {
+ *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */
+ *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1),
+ _mm256_slli_epi64(*s1, 16));
+ *s1 = VROTL(*s1, 37);
+ return _mm256_add_epi64(*s0, *s1); /* return random vector */
+}
+
+/* transform random numbers to the range between 0 and bound - 1 */
+static NPY_INLINE
+__m256i rnd_epu32(__m256i rnd_vec, __m256i bound) {
+ __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32);
+ __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound);
+ return _mm256_blend_epi32(odd, even, 0b01010101);
+}
+
+/**begin repeat
+ *
+ * #TYPE = INT, UINT, FLOAT#
+ * #type = int, uint, float#
+ * #type_t = npy_int, npy_uint, npy_float#
+ * #zmm_t = __m512i, __m512i, __m512#
+ * #ymm_t = __m256i, __m256i, __m256#
+ * #vsuf1 = epi32, epu32, ps#
+ * #vsuf2 = epi32, epi32, ps#
+ * #vsuf3 = si512, si512, ps#
+ * #vsuf4 = s32, u32, f32#
+ * #CMP_GE_OP = _MM_CMPINT_NLT, _MM_CMPINT_NLT, _CMP_GE_OQ#
+ * #TYPE_MAX_VAL = NPY_MAX_INT32, NPY_MAX_UINT32, NPY_INFINITYF#
+ * #TYPE_MIN_VAL = NPY_MIN_INT32, 0, -NPY_INFINITYF#
+ */
+
+/*
+ * COEX == Compare and Exchange two registers by swapping min and max values
+ */
+#define COEX_ZMM_@vsuf1@(a, b) { \
+ @zmm_t@ temp = a; \
+ a = _mm512_min_@vsuf1@(a,b); \
+ b = _mm512_max_@vsuf1@(temp, b);} \
+
+#define COEX_YMM_@vsuf1@(a, b){ \
+ @ymm_t@ temp = a; \
+ a = _mm256_min_@vsuf1@(a, b); \
+ b = _mm256_max_@vsuf1@(temp, b);} \
+
+static NPY_INLINE
+@zmm_t@ cmp_merge_@vsuf1@(@zmm_t@ in1, @zmm_t@ in2, __mmask16 mask)
+{
+ @zmm_t@ min = _mm512_min_@vsuf1@(in2, in1);
+ @zmm_t@ max = _mm512_max_@vsuf1@(in2, in1);
+ return _mm512_mask_mov_@vsuf2@(min, mask, max); // 0 -> min, 1 -> max
+}
+
+/*
+ * Assumes zmm is random and performs a full sorting network defined in
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
+ */
+static NPY_INLINE
+@zmm_t@ sort_zmm_@vsuf1@(@zmm_t@ zmm)
+{
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(0,1,2,3)), 0xCCCC);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA);
+ zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK3),zmm), 0xF0F0);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA);
+ zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5),zmm), 0xFF00);
+ zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC);
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA);
+ return zmm;
+}
+
+// Assumes zmm is bitonic and performs a recursive half cleaner
+static NPY_INLINE
+@zmm_t@ bitonic_merge_zmm_@vsuf1@(@zmm_t@ zmm)
+{
+ // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
+ zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK7),zmm), 0xFF00);
+ // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
+ zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0);
+ // 3) half_cleaner[4]
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC);
+ // 3) half_cleaner[1]
+ zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA);
+ return zmm;
+}
+
+// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
+static NPY_INLINE
+void bitonic_merge_two_zmm_@vsuf1@(@zmm_t@* zmm1, @zmm_t@* zmm2)
+{
+ // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
+ *zmm2 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), *zmm2);
+ @zmm_t@ zmm3 = _mm512_min_@vsuf1@(*zmm1, *zmm2);
+ @zmm_t@ zmm4 = _mm512_max_@vsuf1@(*zmm1, *zmm2);
+ // 2) Recursive half cleaner for each
+ *zmm1 = bitonic_merge_zmm_@vsuf1@(zmm3);
+ *zmm2 = bitonic_merge_zmm_@vsuf1@(zmm4);
+}
+
+// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive half cleaner
+static NPY_INLINE
+void bitonic_merge_four_zmm_@vsuf1@(@zmm_t@* zmm)
+{
+ @zmm_t@ zmm2r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[2]);
+ @zmm_t@ zmm3r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[3]);
+ @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm3r);
+ @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm2r);
+ @zmm_t@ zmm_t3 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm2r));
+ @zmm_t@ zmm_t4 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm3r));
+ @zmm_t@ zmm0 = _mm512_min_@vsuf1@(zmm_t1, zmm_t2);
+ @zmm_t@ zmm1 = _mm512_max_@vsuf1@(zmm_t1, zmm_t2);
+ @zmm_t@ zmm2 = _mm512_min_@vsuf1@(zmm_t3, zmm_t4);
+ @zmm_t@ zmm3 = _mm512_max_@vsuf1@(zmm_t3, zmm_t4);
+ zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm0);
+ zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm1);
+ zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm2);
+ zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm3);
+}
+
+static NPY_INLINE
+void bitonic_merge_eight_zmm_@vsuf1@(@zmm_t@* zmm)
+{
+ @zmm_t@ zmm4r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[4]);
+ @zmm_t@ zmm5r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[5]);
+ @zmm_t@ zmm6r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[6]);
+ @zmm_t@ zmm7r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[7]);
+ @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm7r);
+ @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm6r);
+ @zmm_t@ zmm_t3 = _mm512_min_@vsuf1@(zmm[2], zmm5r);
+ @zmm_t@ zmm_t4 = _mm512_min_@vsuf1@(zmm[3], zmm4r);
+ @zmm_t@ zmm_t5 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[3], zmm4r));
+ @zmm_t@ zmm_t6 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[2], zmm5r));
+ @zmm_t@ zmm_t7 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm6r));
+ @zmm_t@ zmm_t8 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm7r));
+ COEX_ZMM_@vsuf1@(zmm_t1, zmm_t3);
+ COEX_ZMM_@vsuf1@(zmm_t2, zmm_t4);
+ COEX_ZMM_@vsuf1@(zmm_t5, zmm_t7);
+ COEX_ZMM_@vsuf1@(zmm_t6, zmm_t8);
+ COEX_ZMM_@vsuf1@(zmm_t1, zmm_t2);
+ COEX_ZMM_@vsuf1@(zmm_t3, zmm_t4);
+ COEX_ZMM_@vsuf1@(zmm_t5, zmm_t6);
+ COEX_ZMM_@vsuf1@(zmm_t7, zmm_t8);
+ zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm_t1);
+ zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm_t2);
+ zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm_t3);
+ zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm_t4);
+ zmm[4] = bitonic_merge_zmm_@vsuf1@(zmm_t5);
+ zmm[5] = bitonic_merge_zmm_@vsuf1@(zmm_t6);
+ zmm[6] = bitonic_merge_zmm_@vsuf1@(zmm_t7);
+ zmm[7] = bitonic_merge_zmm_@vsuf1@(zmm_t8);
+}
+
+static NPY_INLINE
+void sort_16_@vsuf1@(@type_t@* arr, npy_int N)
+{
+ __mmask16 load_mask = (0x0001 << N) - 0x0001;
+ @zmm_t@ zmm = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr);
+ _mm512_mask_storeu_@vsuf2@(arr, load_mask, sort_zmm_@vsuf1@(zmm));
+}
+
+static NPY_INLINE
+void sort_32_@vsuf1@(@type_t@* arr, npy_int N)
+{
+ if (N <= 16) {
+ sort_16_@vsuf1@(arr, N);
+ return;
+ }
+ @zmm_t@ zmm1 = _mm512_loadu_@vsuf3@(arr);
+ __mmask16 load_mask = (0x0001 << (N-16)) - 0x0001;
+ @zmm_t@ zmm2 = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr + 16);
+ zmm1 = sort_zmm_@vsuf1@(zmm1);
+ zmm2 = sort_zmm_@vsuf1@(zmm2);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm1, &zmm2);
+ _mm512_storeu_@vsuf3@(arr, zmm1);
+ _mm512_mask_storeu_@vsuf2@(arr + 16, load_mask, zmm2);
+}
+
+static NPY_INLINE
+void sort_64_@vsuf1@(@type_t@* arr, npy_int N)
+{
+ if (N <= 32) {
+ sort_32_@vsuf1@(arr, N);
+ return;
+ }
+ @zmm_t@ zmm[4];
+ zmm[0] = _mm512_loadu_@vsuf3@(arr);
+ zmm[1] = _mm512_loadu_@vsuf3@(arr + 16);
+ __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
+ if (N < 48) {
+ load_mask1 = (0x0001 << (N-32)) - 0x0001;
+ load_mask2 = 0x0000;
+ }
+ else if (N < 64) {
+ load_mask2 = (0x0001 << (N-48)) - 0x0001;
+ }
+ zmm[2] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 32);
+ zmm[3] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 48);
+ zmm[0] = sort_zmm_@vsuf1@(zmm[0]);
+ zmm[1] = sort_zmm_@vsuf1@(zmm[1]);
+ zmm[2] = sort_zmm_@vsuf1@(zmm[2]);
+ zmm[3] = sort_zmm_@vsuf1@(zmm[3]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]);
+ bitonic_merge_four_zmm_@vsuf1@(zmm);
+ _mm512_storeu_@vsuf3@(arr, zmm[0]);
+ _mm512_storeu_@vsuf3@(arr + 16, zmm[1]);
+ _mm512_mask_storeu_@vsuf2@(arr + 32, load_mask1, zmm[2]);
+ _mm512_mask_storeu_@vsuf2@(arr + 48, load_mask2, zmm[3]);
+}
+
+static NPY_INLINE
+void sort_128_@vsuf1@(@type_t@* arr, npy_int N)
+{
+ if (N <= 64) {
+ sort_64_@vsuf1@(arr, N);
+ return;
+ }
+ @zmm_t@ zmm[8];
+ zmm[0] = _mm512_loadu_@vsuf3@(arr);
+ zmm[1] = _mm512_loadu_@vsuf3@(arr + 16);
+ zmm[2] = _mm512_loadu_@vsuf3@(arr + 32);
+ zmm[3] = _mm512_loadu_@vsuf3@(arr + 48);
+ zmm[0] = sort_zmm_@vsuf1@(zmm[0]);
+ zmm[1] = sort_zmm_@vsuf1@(zmm[1]);
+ zmm[2] = sort_zmm_@vsuf1@(zmm[2]);
+ zmm[3] = sort_zmm_@vsuf1@(zmm[3]);
+ __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
+ __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
+ if (N < 80) {
+ load_mask1 = (0x0001 << (N-64)) - 0x0001;
+ load_mask2 = 0x0000;
+ load_mask3 = 0x0000;
+ load_mask4 = 0x0000;
+ }
+ else if (N < 96) {
+ load_mask2 = (0x0001 << (N-80)) - 0x0001;
+ load_mask3 = 0x0000;
+ load_mask4 = 0x0000;
+ }
+ else if (N < 112) {
+ load_mask3 = (0x0001 << (N-96)) - 0x0001;
+ load_mask4 = 0x0000;
+ }
+ else {
+ load_mask4 = (0x0001 << (N-112)) - 0x0001;
+ }
+ zmm[4] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 64);
+ zmm[5] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 80);
+ zmm[6] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask3, arr + 96);
+ zmm[7] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask4, arr + 112);
+ zmm[4] = sort_zmm_@vsuf1@(zmm[4]);
+ zmm[5] = sort_zmm_@vsuf1@(zmm[5]);
+ zmm[6] = sort_zmm_@vsuf1@(zmm[6]);
+ zmm[7] = sort_zmm_@vsuf1@(zmm[7]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[4], &zmm[5]);
+ bitonic_merge_two_zmm_@vsuf1@(&zmm[6], &zmm[7]);
+ bitonic_merge_four_zmm_@vsuf1@(zmm);
+ bitonic_merge_four_zmm_@vsuf1@(zmm + 4);
+ bitonic_merge_eight_zmm_@vsuf1@(zmm);
+ _mm512_storeu_@vsuf3@(arr, zmm[0]);
+ _mm512_storeu_@vsuf3@(arr + 16, zmm[1]);
+ _mm512_storeu_@vsuf3@(arr + 32, zmm[2]);
+ _mm512_storeu_@vsuf3@(arr + 48, zmm[3]);
+ _mm512_mask_storeu_@vsuf2@(arr + 64, load_mask1, zmm[4]);
+ _mm512_mask_storeu_@vsuf2@(arr + 80, load_mask2, zmm[5]);
+ _mm512_mask_storeu_@vsuf2@(arr + 96, load_mask3, zmm[6]);
+ _mm512_mask_storeu_@vsuf2@(arr + 112, load_mask4, zmm[7]);
+}
+
+
+static NPY_INLINE
+void swap_@TYPE@(@type_t@ *arr, npy_intp ii, npy_intp jj) {
+ @type_t@ temp = arr[ii];
+ arr[ii] = arr[jj];
+ arr[jj] = temp;
+}
+
+// Median of 3 strategy
+//static NPY_INLINE
+//npy_intp get_pivot_index(@type_t@ *arr, const npy_intp left, const npy_intp right) {
+// return (rand() % (right + 1 - left)) + left;
+// //npy_intp middle = ((right-left)/2) + left;
+// //@type_t@ a = arr[left], b = arr[middle], c = arr[right];
+// //if ((b >= a && b <= c) || (b <= a && b >= c))
+// // return middle;
+// //if ((a >= b && a <= c) || (a <= b && a >= c))
+// // return left;
+// //else
+// // return right;
+//}
+
+/*
+ * Picking the pivot: Median of 72 array elements chosen at random.
+ */
+
+static NPY_INLINE
+@type_t@ get_pivot_@vsuf1@(@type_t@ *arr, const npy_intp left, const npy_intp right) {
+ /* seeds for vectorized random number generator */
+ __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374,
+ 1324281658759788278, 6214952190349879213);
+ __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653,
+ 7874578921548791257, 1998265912745817298);
+ s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left));
+ s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right));
+
+ npy_intp arrsize = right - left + 1;
+ __m256i bound = _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize);
+ __m512i left_vec = _mm512_set1_epi64(left);
+ __m512i right_vec = _mm512_set1_epi64(right);
+ @ymm_t@ v[9];
+ /* fill 9 vectors with random numbers */
+ for (npy_int i = 0; i < 9; ++i) {
+ __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */
+ __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(rand_64, bound)); /* random numbers between 0 and bound - 1 */
+ __m512i indices;
+ if (i < 5)
+ indices = _mm512_add_epi64(left_vec, rand_32); /* indices for arr */
+ else
+ indices = _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */
+
+ v[i] = _mm512_i64gather_@vsuf2@(indices, arr, sizeof(@type_t@));
+ }
+
+ /* median network for 9 elements */
+ COEX_YMM_@vsuf1@(v[0], v[1]); COEX_YMM_@vsuf1@(v[2], v[3]);
+ COEX_YMM_@vsuf1@(v[4], v[5]); COEX_YMM_@vsuf1@(v[6], v[7]);
+ COEX_YMM_@vsuf1@(v[0], v[2]); COEX_YMM_@vsuf1@(v[1], v[3]);
+ COEX_YMM_@vsuf1@(v[4], v[6]); COEX_YMM_@vsuf1@(v[5], v[7]);
+ COEX_YMM_@vsuf1@(v[0], v[4]); COEX_YMM_@vsuf1@(v[1], v[2]);
+ COEX_YMM_@vsuf1@(v[5], v[6]); COEX_YMM_@vsuf1@(v[3], v[7]);
+ COEX_YMM_@vsuf1@(v[1], v[5]); COEX_YMM_@vsuf1@(v[2], v[6]);
+ COEX_YMM_@vsuf1@(v[3], v[5]); COEX_YMM_@vsuf1@(v[2], v[4]);
+ COEX_YMM_@vsuf1@(v[3], v[4]);
+ COEX_YMM_@vsuf1@(v[3], v[8]);
+ COEX_YMM_@vsuf1@(v[4], v[8]);
+
+ // Technically v[4] needs to be sorted before we pick the correct median,
+ // but picking the 4th element works just as well and is cheaper.
+ @type_t@* temp = (@type_t@*) &v[4];
+
+ return temp[4];
+}
+
+/*
+ * Partition one ZMM register based on the pivot and return the index of the
+ * last element that is less than or equal to the pivot.
+ */
+static NPY_INLINE
+npy_int partition_vec_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right,
+ const @zmm_t@ curr_vec, const @zmm_t@ pivot_vec,
+ @zmm_t@* smallest_vec, @zmm_t@* biggest_vec)
+{
+ /* which elements are larger than the pivot */
+ __mmask16 gt_mask = _mm512_cmp_@vsuf1@_mask(curr_vec, pivot_vec, @CMP_GE_OP@);
+ npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask);
+ _mm512_mask_compressstoreu_@vsuf2@(arr + left, _knot_mask16(gt_mask), curr_vec);
+ _mm512_mask_compressstoreu_@vsuf2@(arr + right - amount_gt_pivot, gt_mask, curr_vec);
+ *smallest_vec = _mm512_min_@vsuf1@(curr_vec, *smallest_vec);
+ *biggest_vec = _mm512_max_@vsuf1@(curr_vec, *biggest_vec);
+ return amount_gt_pivot;
+}
+
+/*
+ * Partition an array based on the pivot and return the index of the
+ * last element that is less than or equal to the pivot.
+ */
+static NPY_INLINE
+npy_intp partition_avx512_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right,
+ @type_t@ pivot, @type_t@* smallest, @type_t@* biggest)
+{
+ /* make the array length divisible by 16, shortening the array */
+ for (npy_int i = (right - left) % 16; i > 0; --i) {
+ *smallest = MIN(*smallest, arr[left]);
+ *biggest = MAX(*biggest, arr[left]);
+ if (arr[left] > pivot) {
+ swap_@TYPE@(arr, left, --right);
+ }
+ else {
+ ++left;
+ }
+ }
+
+ if(left == right)
+ return left; /* less than 16 elements in the array */
+
+ @zmm_t@ pivot_vec = _mm512_set1_@vsuf2@(pivot);
+ @zmm_t@ min_vec = _mm512_set1_@vsuf2@(*smallest);
+ @zmm_t@ max_vec = _mm512_set1_@vsuf2@(*biggest);
+
+ if(right - left == 16) {
+ @zmm_t@ vec = _mm512_loadu_@vsuf3@(arr + left);
+ npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec);
+ *smallest = npyv_reducemin_@vsuf4@(min_vec);
+ *biggest = npyv_reducemax_@vsuf4@(max_vec);
+ return left + (16 - amount_gt_pivot);
+ }
+
+ // first and last 16 values are partitioned at the end
+ @zmm_t@ vec_left = _mm512_loadu_@vsuf3@(arr + left);
+ @zmm_t@ vec_right = _mm512_loadu_@vsuf3@(arr + (right - 16));
+ // positions where the two cached vectors will eventually be stored
+ npy_intp r_store = right - 16;
+ npy_intp l_store = left;
+ // indices for loading the elements
+ left += 16;
+ right -= 16;
+ while(right - left != 0) {
+ @zmm_t@ curr_vec;
+ /*
+ * if fewer elements are stored on the right side of the array,
+ * then next elements are loaded from the right side,
+ * otherwise from the left side
+ */
+ if((r_store + 16) - right < left - l_store) {
+ right -= 16;
+ curr_vec = _mm512_loadu_@vsuf3@(arr + right);
+ }
+ else {
+ curr_vec = _mm512_loadu_@vsuf3@(arr + left);
+ left += 16;
+ }
+ // partition the current vector and save it on both sides of the array
+ npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, curr_vec, pivot_vec, &min_vec, &max_vec);
+ r_store -= amount_gt_pivot; l_store += (16 - amount_gt_pivot);
+ }
+
+ /* partition and save vec_left and vec_right */
+ npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, vec_left, pivot_vec, &min_vec, &max_vec);
+ l_store += (16 - amount_gt_pivot);
+ amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, l_store + 16, vec_right, pivot_vec, &min_vec, &max_vec);
+ l_store += (16 - amount_gt_pivot);
+ *smallest = npyv_reducemin_@vsuf4@(min_vec);
+ *biggest = npyv_reducemax_@vsuf4@(max_vec);
+ return l_store;
+}
+
+static NPY_INLINE
+void qsort_@type@(@type_t@* arr, npy_intp left, npy_intp right, npy_int max_iters)
+{
+ /*
+ * Resort to heapsort if quicksort isn't making any progress
+ */
+ if (max_iters <= 0) {
+ heapsort_@type@((void*)(arr + left), right + 1 - left, NULL);
+ return;
+ }
+ /*
+ * Base case: use bitonic networks to sort arrays of <= 128 elements
+ */
+ if (right + 1 - left <= 128) {
+ sort_128_@vsuf1@(arr + left, right + 1 - left);
+ return;
+ }
+
+ @type_t@ pivot = get_pivot_@vsuf1@(arr, left, right);
+ @type_t@ smallest = @TYPE_MAX_VAL@;
+ @type_t@ biggest = @TYPE_MIN_VAL@;
+ npy_intp pivot_index = partition_avx512_@vsuf1@(arr, left, right+1, pivot, &smallest, &biggest);
+ if (pivot != smallest)
+ qsort_@type@(arr, left, pivot_index - 1, max_iters - 1);
+ if (pivot != biggest)
+ qsort_@type@(arr, pivot_index, right, max_iters - 1);
+}
+/**end repeat**/
+
+static NPY_INLINE
+npy_intp replace_nan_with_inf(npy_float* arr, npy_intp arrsize)
+{
+ npy_intp nan_count = 0;
+ __mmask16 loadmask = 0xFFFF;
+ while (arrsize > 0) {
+ if (arrsize < 16) {
+ loadmask = (0x0001 << arrsize) - 0x0001;
+ }
+ __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr);
+ __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
+ nan_count += _mm_popcnt_u32((npy_int) nanmask);
+ _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT);
+ arr += 16;
+ arrsize -= 16;
+ }
+ return nan_count;
+}
+
+static NPY_INLINE
+void replace_inf_with_nan(npy_float* arr, npy_intp arrsize, npy_intp nan_count)
+{
+ for (npy_intp ii = arrsize-1; nan_count > 0; --ii) {
+ arr[ii] = NPY_NANF;
+ nan_count -= 1;
+ }
+}
+
+/**begin repeat
+ *
+ * #type = int, uint, float#
+ * #type_t = npy_int, npy_uint, npy_float#
+ * #FIXNAN = 0, 0, 1#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_@type@)
+(void* arr, npy_intp arrsize)
+{
+ if (arrsize > 1) {
+#if @FIXNAN@
+ npy_intp nan_count = replace_nan_with_inf((@type_t@*) arr, arrsize);
+#endif
+ qsort_@type@((@type_t@*) arr, 0, arrsize-1, 2*log2(arrsize));
+#if @FIXNAN@
+ replace_inf_with_nan((@type_t@*) arr, arrsize, nan_count);
+#endif
+ }
+}
+/**end repeat**/
+
+#endif // NPY_HAVE_AVX512_SKX
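The float path handles NaNs by temporarily turning them into +inf (replace_nan_with_inf) and writing NaN back into the trailing slots afterwards (replace_inf_with_nan). A Python sketch of that strategy, using ndarray.sort as a stand-in for the vectorized quicksort; the helper name is made up:

    import numpy as np

    def sort_with_nan_sentinel(arr):
        arr = arr.astype(np.float32, copy=True)
        nan_count = int(np.isnan(arr).sum())
        arr[np.isnan(arr)] = np.inf      # replace_nan_with_inf
        arr.sort()                       # stand-in for the AVX-512 quicksort
        if nan_count:
            arr[-nan_count:] = np.nan    # replace_inf_with_nan
        return arr

    x = np.array([3.0, np.nan, 1.0, np.inf, 2.0], dtype=np.float32)
    print(sort_with_nan_sentinel(x))     # [ 1.  2.  3. inf nan]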
diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h
new file mode 100644
index 000000000..8cb8e3654
--- /dev/null
+++ b/numpy/core/src/npysort/x86-qsort.h
@@ -0,0 +1,18 @@
+#include "numpy/npy_common.h"
+#include "npy_cpu_dispatch.h"
+
+#ifndef NPY_NO_EXPORT
+ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "x86-qsort.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int,
+ (void *start, npy_intp num))
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint,
+ (void *start, npy_intp num))
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float,
+ (void *start, npy_intp num))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 708e82910..73bb5e2d8 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -9278,3 +9278,52 @@ class TestViewDtype:
[[1284, 1798], [4368, 4882]],
[[2312, 2826], [5396, 5910]]]
assert_array_equal(x.view('<i2'), expected)
+
+
+# Test various array sizes that hit different code paths in quicksort-avx512
+@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191,
+ 256, 383, 512, 1023, 2047])
+def test_sort_float(N):
+    # (1) Regular data with nan sprinkled
+ np.random.seed(42)
+ arr = -0.5 + np.random.sample(N).astype('f')
+ arr[np.random.choice(arr.shape[0], 3)] = np.nan
+ assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
+
+ # (2) with +INF
+ infarr = np.inf*np.ones(N, dtype='f')
+ infarr[np.random.choice(infarr.shape[0], 5)] = -1.0
+ assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
+
+ # (3) with -INF
+ neginfarr = -np.inf*np.ones(N, dtype='f')
+ neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0
+ assert_equal(np.sort(neginfarr, kind='quick'),
+ np.sort(neginfarr, kind='heap'))
+
+ # (4) with +/-INF
+ infarr = np.inf*np.ones(N, dtype='f')
+    infarr[np.random.choice(infarr.shape[0], N // 2)] = -np.inf
+ assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
+
+
+def test_sort_int():
+ # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled
+ rng = np.random.default_rng(42)
+ N = 2047
+ minv = np.iinfo(np.int32).min
+ maxv = np.iinfo(np.int32).max
+ arr = rng.integers(low=minv, high=maxv, size=N).astype('int32')
+ arr[np.random.choice(arr.shape[0], 10)] = minv
+ arr[np.random.choice(arr.shape[0], 10)] = maxv
+ assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
+
+
+def test_sort_uint():
+ # Random data with NPY_MAX_UINT32 sprinkled
+ rng = np.random.default_rng(42)
+ N = 2047
+ maxv = np.iinfo(np.uint32).max
+ arr = rng.integers(low=0, high=maxv, size=N).astype('uint32')
+ arr[np.random.choice(arr.shape[0], 10)] = maxv
+ assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index d5b130b72..900538134 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4411,7 +4411,7 @@ def _check_interpolation_as_method(method, interpolation, fname):
f"the `interpolation=` argument to {fname} was renamed to "
"`method=`, which has additional options.\n"
"Users of the modes 'nearest', 'lower', 'higher', or "
- "'midpoint' are encouraged to review the method they. "
+ "'midpoint' are encouraged to review the method they used. "
"(Deprecated NumPy 1.22)",
DeprecationWarning, stacklevel=4)
if method != "linear":
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6818ef81d..90424aab4 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,6 +5,7 @@ import itertools
import warnings
import weakref
import contextlib
+import operator
from operator import itemgetter, index as opindex, methodcaller
from collections.abc import Mapping
@@ -13,6 +14,7 @@ from . import format
from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
+from numpy.core._multiarray_umath import _load_from_filelike
from numpy.core.overrides import set_array_function_like_doc, set_module
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf.close()
-def _floatconv(x):
- try:
- return float(x) # The fastest path.
- except ValueError:
- if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10.
- try:
- return float.fromhex(x)
- except ValueError:
- pass
- raise # Raise the original exception, which makes more sense.
-
-
-_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
- (np.bool_, lambda x: bool(int(x))),
- (np.uint64, np.uint64),
- (np.int64, np.int64),
- (np.integer, lambda x: int(float(x))),
- (np.longdouble, np.longdouble),
- (np.floating, _floatconv),
- (complex, lambda x: complex(x.replace('+-', '-'))),
- (np.bytes_, methodcaller('encode', 'latin-1')),
- (np.unicode_, str),
-]
-
-
-def _getconv(dtype):
- """
- Find the correct dtype converter. Adapted from matplotlib.
-
- Even when a lambda is returned, it is defined at the toplevel, to allow
- testing for equality and enabling optimization for single-type data.
- """
- for base, conv in _CONVERTERS:
- if issubclass(dtype.type, base):
- return conv
- return str
-
-
-# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
-# lifted to the toplevel because recursive inner functions cause either
-# GC-dependent reference loops (because they are closures over loadtxt's
-# internal variables) or large overheads if using a manual trampoline to hide
-# the recursive calls.
-
-
-# not to be confused with the flatten_dtype we import...
-def _loadtxt_flatten_dtype_internal(dt):
- """Unpack a structured data-type, and produce a packer function."""
- if dt.names is None:
- # If the dtype is flattened, return.
- # If the dtype has a shape, the dtype occurs
- # in the list more than once.
- shape = dt.shape
- if len(shape) == 0:
- return ([dt.base], None)
- else:
- packing = [(shape[-1], list)]
- if len(shape) > 1:
- for dim in dt.shape[-2::-1]:
- packing = [(dim*packing[0][0], packing*dim)]
- return ([dt.base] * int(np.prod(dt.shape)),
- functools.partial(_loadtxt_pack_items, packing))
- else:
- types = []
- packing = []
- for field in dt.names:
- tp, bytes = dt.fields[field]
- flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
- types.extend(flat_dt)
- flat_packing = flat_packer.args[0] if flat_packer else None
- # Avoid extra nesting for subarrays
- if tp.ndim > 0:
- packing.extend(flat_packing)
- else:
- packing.append((len(flat_dt), flat_packing))
- return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
- """Pack items into nested lists based on re-packing info."""
- if packing is None:
- return items[0]
- elif packing is tuple:
- return tuple(items)
- elif packing is list:
- return list(items)
- else:
- start = 0
- ret = []
- for length, subpacking in packing:
- ret.append(
- _loadtxt_pack_items(subpacking, items[start:start+length]))
- start += length
- return tuple(ret)
-
def _ensure_ndmin_ndarray_check_param(ndmin):
"""Just checks if the param ndmin is supported on
_ensure_ndmin_ndarray. It is intended to be used as
@@ -853,17 +760,330 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int):
_loadtxt_chunksize = 50000
-def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
- converters=None, skiprows=None, usecols=None, unpack=None,
- ndmin=None, encoding=None, max_rows=None, *, like=None):
+def _loadtxt_dispatcher(
+ fname, dtype=None, comments=None, delimiter=None,
+ converters=None, skiprows=None, usecols=None, unpack=None,
+ ndmin=None, encoding=None, max_rows=None, *, like=None):
return (like,)
+def _check_nonneg_int(value, name="argument"):
+ try:
+ operator.index(value)
+ except TypeError:
+ raise TypeError(f"{name} must be an integer") from None
+ if value < 0:
+ raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+ """
+    Generator that consumes an iterable of lines and strips out
+    multiple (or multi-character) comments from each line.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume that this is a niche feature).
+ """
+ for line in iterable:
+ if isinstance(line, bytes):
+ # Need to handle conversion here, or the splitting would fail
+ line = line.decode(encoding)
+
+ for c in comments:
+ line = line.split(c, 1)[0]
+
+ yield line
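A rough sketch of how the pre-processing above behaves, using hypothetical
input lines and comment markers (illustrative only):

    >>> lines = ["1,2 // trailing comment", "3,4 %% another"]
    >>> list(_preprocess_comments(lines, comments=["//", "%%"], encoding=None))
    ['1,2 ', '3,4 ']

Each line is split on every comment marker in turn and only the text before
the first marker found is kept; byte lines are decoded first.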
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+ imaginary_unit='j', usecols=None, skiplines=0,
+ max_rows=None, converters=None, ndmin=None, unpack=False,
+ dtype=np.float64, encoding="bytes"):
+ r"""
+ Read a NumPy array from a text file.
+
+ Parameters
+ ----------
+ fname : str or file object
+ The filename or the file to be read.
+ delimiter : str, optional
+        Field delimiter separating the fields in each line of the file.
+        Default is a comma, ','. If None, any sequence of whitespace is
+        considered a delimiter.
+ comment : str or sequence of str or None, optional
+ Character that begins a comment. All text from the comment
+ character to the end of the line is ignored.
+ Multiple comments or multiple-character comment strings are supported,
+ but may be slower and `quote` must be empty if used.
+ Use None to disable all use of comments.
+ quote : str or None, optional
+ Character that is used to quote string fields. Default is '"'
+ (a double quote). Use None to disable quote support.
+ imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+ Default is 'j'.
+ usecols : array_like, optional
+ A one-dimensional array of integer column numbers. These are the
+ columns from the file to be included in the array. If this value
+ is not given, all the columns are used.
+ skiplines : int, optional
+ Number of lines to skip before interpreting the data in the file.
+ max_rows : int, optional
+ Maximum number of rows of data to read. Default is to read the
+ entire file.
+ converters : dict or callable, optional
+        A function to parse all column strings into the desired value, or
+ a dictionary mapping column number to a parser function.
+ E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
+ Converters can also be used to provide a default value for missing
+ data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
+ convert empty fields to 0.
+ Default: None
+ ndmin : int, optional
+ Minimum dimension of the array returned.
+ Allowed values are 0, 1 or 2. Default is 0.
+ unpack : bool, optional
+ If True, the returned array is transposed, so that arguments may be
+ unpacked using ``x, y, z = read(...)``. When used with a structured
+ data-type, arrays are returned for each field. Default is False.
+ dtype : numpy data type
+ A NumPy dtype instance, can be a structured dtype to map to the
+ columns of the file.
+ encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+ (the default) enables backwards-compatible behavior for `converters`,
+ ensuring that inputs to the converter functions are encoded
+ bytes objects. The special value 'bytes' has no additional effect if
+ ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+ default system encoding is used.
+
+ Returns
+ -------
+ ndarray
+ NumPy array.
+
+ Examples
+ --------
+ First we create a file for the example.
+
+ >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+ >>> with open('example1.csv', 'w') as f:
+ ... f.write(s1)
+    >>> a1 = _read('example1.csv')
+ >>> a1
+ array([[1., 2., 3.],
+ [4., 5., 6.]])
+
+ The second example has columns with different data types, so a
+ one-dimensional array with a structured data type is returned.
+ The tab character is used as the field delimiter.
+
+ >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+ >>> with open('example2.tsv', 'w') as f:
+ ... f.write(s2)
+    >>> a2 = _read('example2.tsv', delimiter='\t')
+ >>> a2
+ array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+ dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
+ """
+ # Handle special 'bytes' keyword for encoding
+ byte_converters = False
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+
+ if dtype is None:
+ raise TypeError("a dtype must be provided.")
+ dtype = np.dtype(dtype)
+
+ read_dtype_via_object_chunks = None
+ if dtype.kind in 'SUM' and (
+ dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+ # This is a legacy "flexible" dtype. We do not truly support
+ # parametric dtypes currently (no dtype discovery step in the core),
+ # but have to support these for backward compatibility.
+ read_dtype_via_object_chunks = dtype
+ dtype = np.dtype(object)
+
+ if usecols is not None:
+ # Allow usecols to be a single int or a sequence of ints, the C-code
+ # handles the rest
+ try:
+ usecols = list(usecols)
+ except TypeError:
+ usecols = [usecols]
+
+ _ensure_ndmin_ndarray_check_param(ndmin)
+
+ if comment is None:
+ comments = None
+ else:
+ # assume comments are a sequence of strings
+ if "" in comment:
+ raise ValueError(
+ "comments cannot be an empty string. Use comments=None to "
+ "disable comments."
+ )
+ comments = tuple(comment)
+ comment = None
+ if len(comments) == 0:
+ comments = None # No comments at all
+ elif len(comments) == 1:
+ # If there is only one comment, and that comment has one character,
+ # the normal parsing can deal with it just fine.
+ if isinstance(comments[0], str) and len(comments[0]) == 1:
+ comment = comments[0]
+ comments = None
+ else:
+ # Input validation if there are multiple comment characters
+ if delimiter in comments:
+ raise TypeError(
+ f"Comment characters '{comments}' cannot include the "
+ f"delimiter '{delimiter}'"
+ )
+
+ # comment is now either a 1 or 0 character string or a tuple:
+ if comments is not None:
+        # Note: An earlier version supported two-character comments (and could
+        # have been extended to multiple characters); we assume this is
+        # rare enough not to optimize for.
+ if quote is not None:
+ raise ValueError(
+ "when multiple comments or a multi-character comment is "
+ "given, quotes are not supported. In this case quotechar "
+ "must be set to None.")
+
+ if len(imaginary_unit) != 1:
+ raise ValueError('len(imaginary_unit) must be 1.')
+
+ _check_nonneg_int(skiplines)
+ if max_rows is not None:
+ _check_nonneg_int(max_rows)
+ else:
+ # Passing -1 to the C code means "read the entire file".
+ max_rows = -1
+
+ fh_closing_ctx = contextlib.nullcontext()
+ filelike = False
+ try:
+ if isinstance(fname, os.PathLike):
+ fname = os.fspath(fname)
+ if isinstance(fname, str):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ if encoding is None:
+ encoding = getattr(fh, 'encoding', 'latin1')
+
+ fh_closing_ctx = contextlib.closing(fh)
+ data = fh
+ filelike = True
+ else:
+ if encoding is None:
+ encoding = getattr(fname, 'encoding', 'latin1')
+ data = iter(fname)
+ except TypeError as e:
+ raise ValueError(
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead.") from e
+
+ with fh_closing_ctx:
+ if comments is not None:
+ if filelike:
+ data = iter(data)
+ filelike = False
+ data = _preprocess_comments(data, comments, encoding)
+
+ if read_dtype_via_object_chunks is None:
+ arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiplines=skiplines, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters)
+
+ else:
+ # This branch reads the file into chunks of object arrays and then
+ # casts them to the desired actual dtype. This ensures correct
+ # string-length and datetime-unit discovery (like `arr.astype()`).
+ # Due to chunking, certain error reports are less clear, currently.
+ if filelike:
+ data = iter(data) # cannot chunk when reading from file
+
+ c_byte_converters = False
+ if read_dtype_via_object_chunks == "S":
+ c_byte_converters = True # Use latin1 rather than ascii
+
+ chunks = []
+ while max_rows != 0:
+ if max_rows < 0:
+ chunk_size = _loadtxt_chunksize
+ else:
+ chunk_size = min(_loadtxt_chunksize, max_rows)
+
+ next_arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+                    usecols=usecols, skiplines=skiplines, max_rows=chunk_size,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters,
+ c_byte_converters=c_byte_converters)
+ # Cast here already. We hope that this is better even for
+ # large files because the storage is more compact. It could
+ # be adapted (in principle the concatenate could cast).
+ chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+                skiplines = 0  # Only have to skip for the first chunk
+ if max_rows >= 0:
+ max_rows -= chunk_size
+ if len(next_arr) < chunk_size:
+ # There was less data than requested, so we are done.
+ break
+
+ # Need at least one chunk, but if empty, the last one may have
+ # the wrong shape.
+ if len(chunks) > 1 and len(chunks[-1]) == 0:
+ del chunks[-1]
+ if len(chunks) == 1:
+ arr = chunks[0]
+ else:
+ arr = np.concatenate(chunks, axis=0)
+
+ # NOTE: ndmin works as advertised for structured dtypes, but normally
+ # these would return a 1D result plus the structured dimension,
+ # so ndmin=2 adds a third dimension even when no squeezing occurs.
+ # A `squeeze=False` could be a better solution (pandas uses squeeze).
+ arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+ if arr.shape:
+ if arr.shape[0] == 0:
+ warnings.warn(
+ f'loadtxt: input contained no data: "{fname}"',
+ category=UserWarning,
+ stacklevel=3
+ )
+
+ if unpack:
+ # Unpack structured dtypes if requested:
+ dt = arr.dtype
+ if dt.names is not None:
+ # For structured arrays, return an array for each field.
+ return [arr[field] for field in dt.names]
+ else:
+ return arr.T
+ else:
+ return arr
+
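A rough illustration of the object-chunk path described above, for a string
dtype given without a length (hypothetical session; the resulting dtype matches
the behaviour exercised by test_string_no_length_given in
numpy/lib/tests/test_loadtxt.py further down in this patch):

    >>> from io import StringIO
    >>> np.loadtxt(StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n"), dtype="U", delimiter=",")
    array([['AAA', '5-1'],
           ['BBBBB', '0-3'],
           ['C', '4-9']], dtype='<U5')

The rows are first parsed into object-dtype chunks and then cast, so the final
string length is determined by the longest field found in the file.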
+
@set_array_function_like_doc
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0, encoding='bytes', max_rows=None, *, like=None):
+ ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None,
+ like=None):
r"""
Load data from a text file.
@@ -882,19 +1102,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
each row will be interpreted as an element of the array. In this
case, the number of columns used must match the number of fields in
the data-type.
- comments : str or sequence of str, optional
+ comments : str or sequence of str or None, optional
The characters or list of characters used to indicate the start of a
comment. None implies no comments. For backwards compatibility, byte
strings will be decoded as 'latin1'. The default is '#'.
delimiter : str, optional
The string used to separate values. For backwards compatibility, byte
strings will be decoded as 'latin1'. The default is whitespace.
- converters : dict, optional
- A dictionary mapping column number to a function that will parse the
- column string into the desired value. E.g., if column 0 is a date
- string: ``converters = {0: datestr2num}``. Converters can also be
- used to provide a default value for missing data (but see also
- `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
+ converters : dict or callable, optional
+        A function to parse all column strings into the desired value, or
+ a dictionary mapping column number to a parser function.
+ E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
+ Converters can also be used to provide a default value for missing
+ data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
+ convert empty fields to 0.
Default: None.
skiprows : int, optional
Skip the first `skiprows` lines, including comments; default: 0.
@@ -932,6 +1153,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
is to read all the lines.
.. versionadded:: 1.16.0
+ quotechar : unicode character or None, optional
+ The character used to denote the start and end of a quoted item.
+ Occurrences of the delimiter or comment characters are ignored within
+ a quoted item. The default value is ``quotechar=None``, which means
+ quoting support is disabled.
+
+ If two consecutive instances of `quotechar` are found within a quoted
+ field, the first is treated as an escape character. See examples.
+
+ .. versionadded:: 1.23.0
${ARRAY_FUNCTION_LIKE}
.. versionadded:: 1.20.0
@@ -979,6 +1210,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
>>> y
array([2., 4.])
+ The `converters` argument is used to specify functions to preprocess the
+ text prior to parsing. `converters` can be a dictionary that maps
+ preprocessing functions to each column:
+
+ >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n")
+ >>> conv = {
+ ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0
+ ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1
+ ... }
+ >>> np.loadtxt(s, delimiter=",", converters=conv)
+ array([[1., 3.],
+ [3., 5.]])
+
+ `converters` can be a callable instead of a dictionary, in which case it
+ is applied to all columns:
+
+ >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE")
+ >>> import functools
+ >>> conv = functools.partial(int, base=16)
+ >>> np.loadtxt(s, converters=conv)
+ array([[222., 173.],
+ [192., 222.]])
+
This example shows how `converters` can be used to convert a field
with a trailing minus sign into a negative number.
@@ -986,242 +1240,90 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
>>> def conv(fld):
... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld)
...
- >>> np.loadtxt(s, converters={0: conv, 1: conv})
+ >>> np.loadtxt(s, converters=conv)
array([[ 10.01, -31.25],
[ 19.22, 64.31],
[-17.57, 63.94]])
- """
-
- if like is not None:
- return _loadtxt_with_like(
- fname, dtype=dtype, comments=comments, delimiter=delimiter,
- converters=converters, skiprows=skiprows, usecols=usecols,
- unpack=unpack, ndmin=ndmin, encoding=encoding,
- max_rows=max_rows, like=like
- )
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Nested functions used by loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ Using a callable as the converter can be particularly useful for handling
+ values with different formatting, e.g. floats with underscores:
- def split_line(line: str):
- """Chop off comments, strip, and split at delimiter."""
- for comment in comments: # Much faster than using a single regex.
- line = line.split(comment, 1)[0]
- line = line.strip('\r\n')
- return line.split(delimiter) if line else []
+ >>> s = StringIO("1 2.7 100_000")
+ >>> np.loadtxt(s, converters=float)
+ array([1.e+00, 2.7e+00, 1.e+05])
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Main body of loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- _ensure_ndmin_ndarray_check_param(ndmin)
-
- # Type conversions for Py3 convenience
- if comments is not None:
- if isinstance(comments, (str, bytes)):
- comments = [comments]
- comments = [_decode_line(x) for x in comments]
- else:
- comments = []
-
- if delimiter is not None:
- delimiter = _decode_line(delimiter)
-
- user_converters = converters
-
- byte_converters = False
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
-
- if usecols is not None:
- # Copy usecols, allowing it to be a single int or a sequence of ints.
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols]
- for i, col_idx in enumerate(usecols):
- try:
- usecols[i] = opindex(col_idx) # Cast to builtin int now.
- except TypeError as e:
- e.args = (
- "usecols must be an int or a sequence of ints but "
- "it contains at least one element of type %s" %
- type(col_idx),
- )
- raise
- if len(usecols) > 1:
- usecols_getter = itemgetter(*usecols)
- else:
- # Get an iterable back, even if using a single column.
- usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
- else:
- usecols_getter = None
+ This idea can be extended to automatically handle values specified in
+ many different formats:
- # Make sure we're dealing with a proper dtype
- dtype = np.dtype(dtype)
- defconv = _getconv(dtype)
+ >>> def conv(val):
+ ... try:
+ ... return float(val)
+ ... except ValueError:
+ ... return float.fromhex(val)
+ >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2")
+ >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None)
+ array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00])
- dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
+ Note that with the default ``encoding="bytes"``, the inputs to the
+ converter function are latin-1 encoded byte strings. To deactivate the
+    implicit encoding prior to conversion, use ``encoding=None``.
- fh_closing_ctx = contextlib.nullcontext()
- try:
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if _is_string_like(fname):
- fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- fencoding = getattr(fh, 'encoding', 'latin1')
- line_iter = iter(fh)
- fh_closing_ctx = contextlib.closing(fh)
- else:
- line_iter = iter(fname)
- fencoding = getattr(fname, 'encoding', 'latin1')
- try:
- first_line = next(line_iter)
- except StopIteration:
- pass # Nothing matters if line_iter is empty.
- else:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- if isinstance(first_line, bytes):
- # Using latin1 matches _decode_line's behavior.
- decoder = methodcaller(
- "decode",
- encoding if encoding is not None else "latin1")
- line_iter = map(decoder, line_iter)
- except TypeError as e:
- raise ValueError(
- f"fname must be a string, filehandle, list of strings,\n"
- f"or generator. Got {type(fname)} instead."
- ) from e
+ >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94')
+ >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x)
+ >>> np.loadtxt(s, converters=conv, encoding=None)
+ array([[ 10.01, -31.25],
+ [ 19.22, 64.31],
+ [-17.57, 63.94]])
- with fh_closing_ctx:
+ Support for quoted fields is enabled with the `quotechar` parameter.
+ Comment and delimiter characters are ignored when they appear within a
+ quoted item delineated by `quotechar`:
- # input may be a python2 io stream
- if encoding is not None:
- fencoding = encoding
- # we must assume local encoding
- # TODO emit portability warning?
- elif fencoding is None:
- import locale
- fencoding = locale.getpreferredencoding()
-
- # Skip the first `skiprows` lines
- for i in range(skiprows):
- next(line_iter)
-
- # Read until we find a line with some values, and use it to determine
- # the need for decoding and estimate the number of columns.
- for first_line in line_iter:
- ncols = len(usecols or split_line(first_line))
- if ncols:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- break
- else: # End of lines reached
- ncols = len(usecols or [])
- warnings.warn('loadtxt: Empty input file: "%s"' % fname,
- stacklevel=2)
-
- line_iter = itertools.islice(line_iter, max_rows)
- lineno_words_iter = filter(
- itemgetter(1), # item[1] is words; filter skips empty lines.
- enumerate(map(split_line, line_iter), 1 + skiprows))
-
- # Now that we know ncols, create the default converters list, and
- # set packing, if necessary.
- if len(dtype_types) > 1:
- # We're dealing with a structured array, each field of
- # the dtype matches a column
- converters = [_getconv(dt) for dt in dtype_types]
- else:
- # All fields have the same dtype; use specialized packers which are
- # much faster than those using _loadtxt_pack_items.
- converters = [defconv for i in range(ncols)]
- if ncols == 1:
- packer = itemgetter(0)
- else:
- def packer(row): return row
+ >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n')
+ >>> dtype = np.dtype([("label", "U12"), ("value", float)])
+ >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"')
+ array([('alpha, #42', 10.), ('beta, #64', 2.)],
+ dtype=[('label', '<U12'), ('value', '<f8')])
- # By preference, use the converters specified by the user
- for i, conv in (user_converters or {}).items():
- if usecols:
- try:
- i = usecols.index(i)
- except ValueError:
- # Unused converter specified
- continue
- if byte_converters:
- # converters may use decode to workaround numpy's old
- # behaviour, so encode the string again (converters are only
- # called with strings) before passing to the user converter.
- def tobytes_first(conv, x):
- return conv(x.encode("latin1"))
- converters[i] = functools.partial(tobytes_first, conv)
- else:
- converters[i] = conv
-
- fencode = methodcaller("encode", fencoding)
- converters = [conv if conv is not bytes else fencode
- for conv in converters]
- if len(set(converters)) == 1:
- # Optimize single-type data. Note that this is only reached if
- # `_getconv` returns equal callables (i.e. not local lambdas) on
- # equal dtypes.
- def convert_row(vals, _conv=converters[0]):
- return [*map(_conv, vals)]
- else:
- def convert_row(vals):
- return [conv(val) for conv, val in zip(converters, vals)]
-
- # read data in chunks and fill it into an array via resize
- # over-allocating and shrinking the array later may be faster but is
- # probably not relevant compared to the cost of actually reading and
- # converting the data
- X = None
- while True:
- chunk = []
- for lineno, words in itertools.islice(
- lineno_words_iter, _loadtxt_chunksize):
- if usecols_getter is not None:
- words = usecols_getter(words)
- elif len(words) != ncols:
- raise ValueError(
- f"Wrong number of columns at line {lineno}")
- # Convert each value according to its column, then pack it
- # according to the dtype's nesting, and store it.
- chunk.append(packer(convert_row(words)))
- if not chunk: # The islice is empty, i.e. we're done.
- break
+ Two consecutive quote characters within a quoted field are treated as a
+ single escaped character:
- if X is None:
- X = np.array(chunk, dtype)
- else:
- nshape = list(X.shape)
- pos = nshape[0]
- nshape[0] += len(chunk)
- X.resize(nshape, refcheck=False)
- X[pos:, ...] = chunk
+ >>> s = StringIO('"Hello, my name is ""Monty""!"')
+ >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"')
+ array('Hello, my name is "Monty"!', dtype='<U26')
- if X is None:
- X = np.array([], dtype)
+ """
- # Multicolumn data are returned with shape (1, N, M), i.e.
- # (1, 1, M) for a single row - remove the singleton dimension there
- if X.ndim == 3 and X.shape[:2] == (1, 1):
- X.shape = (1, -1)
+ if like is not None:
+ return _loadtxt_with_like(
+ fname, dtype=dtype, comments=comments, delimiter=delimiter,
+ converters=converters, skiprows=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, like=like
+ )
- X = _ensure_ndmin_ndarray(X, ndmin=ndmin)
+ if isinstance(delimiter, bytes):
+ delimiter.decode("latin1")
- if unpack:
- if len(dtype_types) > 1:
- # For structured arrays, return an array for each field.
- return [X[field] for field in dtype.names]
- else:
- return X.T
- else:
- return X
+ if dtype is None:
+ dtype = np.float64
+
+ comment = comments
+ # Control character type conversions for Py3 convenience
+ if comment is not None:
+ if isinstance(comment, (str, bytes)):
+ comment = [comment]
+ comment = [
+ x.decode('latin1') if isinstance(x, bytes) else x for x in comment]
+ if isinstance(delimiter, bytes):
+ delimiter = delimiter.decode('latin1')
+
+ arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
+ converters=converters, skiplines=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, quote=quotechar)
+
+ return arr
_loadtxt_with_like = array_function_dispatch(
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index b9b10bc06..a2758123b 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -695,7 +695,7 @@ class TestLoadTxt(LoadTxtBase):
assert_array_equal(x, a)
d = TextIO()
- d.write('M 64.0 75.0\nF 25.0 60.0')
+ d.write('M 64 75.0\nF 25 60.0')
d.seek(0)
mydescriptor = {'names': ('gender', 'age', 'weight'),
'formats': ('S1', 'i4', 'f4')}
@@ -779,6 +779,8 @@ class TestLoadTxt(LoadTxtBase):
a = np.array([[1, 2, 3], [4, 5, 6]], int)
assert_array_equal(x, a)
+ @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
def test_comments_multi_chars(self):
c = TextIO()
c.write('/* comment\n1,2,3,5\n')
@@ -871,16 +873,27 @@ class TestLoadTxt(LoadTxtBase):
bogus_idx = 1.5
assert_raises_regex(
TypeError,
- '^usecols must be.*%s' % type(bogus_idx),
+ '^usecols must be.*%s' % type(bogus_idx).__name__,
np.loadtxt, c, usecols=bogus_idx
)
assert_raises_regex(
TypeError,
- '^usecols must be.*%s' % type(bogus_idx),
+ '^usecols must be.*%s' % type(bogus_idx).__name__,
np.loadtxt, c, usecols=[0, bogus_idx, 0]
)
+ def test_bad_usecols(self):
+ with pytest.raises(OverflowError):
+ np.loadtxt(["1\n"], usecols=[2**64], delimiter=",")
+ with pytest.raises((ValueError, OverflowError)):
+ # Overflow error on 32bit platforms
+ np.loadtxt(["1\n"], usecols=[2**62], delimiter=",")
+ with pytest.raises(TypeError,
+ match="If a structured dtype .*. But 1 usecols were given and "
+ "the number of fields is 3."):
+ np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",")
+
def test_fancy_dtype(self):
c = TextIO()
c.write('1,2,3.0\n4,5,6.0\n')
@@ -919,8 +932,7 @@ class TestLoadTxt(LoadTxtBase):
assert_array_equal(x, a)
def test_empty_file(self):
- with suppress_warnings() as sup:
- sup.filter(message="loadtxt: Empty input file:")
+ with pytest.warns(UserWarning, match="input contained no data"):
c = TextIO()
x = np.loadtxt(c)
assert_equal(x.shape, (0,))
@@ -981,29 +993,32 @@ class TestLoadTxt(LoadTxtBase):
c.write(inp)
for dt in [float, np.float32]:
c.seek(0)
- res = np.loadtxt(c, dtype=dt)
+ res = np.loadtxt(
+ c, dtype=dt, converters=float.fromhex, encoding="latin1")
assert_equal(res, tgt, err_msg="%s" % dt)
+ @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
def test_default_float_converter_no_default_hex_conversion(self):
"""
Ensure that fromhex is only used for values with the correct prefix and
is not called by default. Regression test related to gh-19598.
"""
c = TextIO("a b c")
- with pytest.raises(
- ValueError, match="could not convert string to float"
- ):
+ with pytest.raises(ValueError,
+ match=".*convert string 'a' to float64 at row 0, column 1"):
np.loadtxt(c)
+ @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
def test_default_float_converter_exception(self):
"""
Ensure that the exception message raised during failed floating point
conversion is correct. Regression test related to gh-19598.
"""
c = TextIO("qrs tuv") # Invalid values for default float converter
- with pytest.raises(
- ValueError, match="could not convert string to float"
- ):
+ with pytest.raises(ValueError,
+ match="could not convert string 'qrs' to float64"):
np.loadtxt(c)
def test_from_complex(self):
@@ -1099,8 +1114,7 @@ class TestLoadTxt(LoadTxtBase):
assert_(x.shape == (3,))
# Test ndmin kw with empty file.
- with suppress_warnings() as sup:
- sup.filter(message="loadtxt: Empty input file:")
+ with pytest.warns(UserWarning, match="input contained no data"):
f = TextIO()
assert_(np.loadtxt(f, ndmin=2).shape == (0, 1,))
assert_(np.loadtxt(f, ndmin=1).shape == (0,))
@@ -1132,8 +1146,8 @@ class TestLoadTxt(LoadTxtBase):
@pytest.mark.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968',
reason="Wrong preferred encoding")
def test_binary_load(self):
- butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\
- b"20,2,3,\xc3\x95scar\n\r"
+ butf8 = b"5,6,7,\xc3\x95scarscar\r\n15,2,3,hello\r\n"\
+ b"20,2,3,\xc3\x95scar\r\n"
sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines()
with temppath() as path:
with open(path, "wb") as f:
@@ -1196,6 +1210,30 @@ class TestLoadTxt(LoadTxtBase):
a = np.array([[1, 2, 3, 5], [4, 5, 7, 8], [2, 1, 4, 5]], int)
assert_array_equal(x, a)
+ @pytest.mark.parametrize(["skip", "data"], [
+ (1, ["ignored\n", "1,2\n", "\n", "3,4\n"]),
+ # "Bad" lines that do not end in newlines:
+ (1, ["ignored", "1,2", "", "3,4"]),
+ (1, StringIO("ignored\n1,2\n\n3,4")),
+ # Same as above, but do not skip any lines:
+ (0, ["-1,0\n", "1,2\n", "\n", "3,4\n"]),
+ (0, ["-1,0", "1,2", "", "3,4"]),
+ (0, StringIO("-1,0\n1,2\n\n3,4"))])
+ def test_max_rows_empty_lines(self, skip, data):
+ with pytest.warns(UserWarning,
+ match=f"Input line 3.*max_rows={3-skip}"):
+ res = np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",",
+ max_rows=3-skip)
+ assert_array_equal(res, [[-1, 0], [1, 2], [3, 4]][skip:])
+
+ if isinstance(data, StringIO):
+ data.seek(0)
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("error", UserWarning)
+ with pytest.raises(UserWarning):
+ np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",",
+ max_rows=3-skip)
class Testfromregex:
def test_record(self):
@@ -2397,6 +2435,13 @@ M 33 21.99
assert_equal(test['f1'], 17179869184)
assert_equal(test['f2'], 1024)
+ def test_unpack_float_data(self):
+ txt = TextIO("1,2,3\n4,5,6\n7,8,9\n0.0,1.0,2.0")
+ a, b, c = np.loadtxt(txt, delimiter=",", unpack=True)
+ assert_array_equal(a, np.array([1.0, 4.0, 7.0, 0.0]))
+ assert_array_equal(b, np.array([2.0, 5.0, 8.0, 1.0]))
+ assert_array_equal(c, np.array([3.0, 6.0, 9.0, 2.0]))
+
def test_unpack_structured(self):
# Regression test for gh-4341
# Unpacking should work on structured arrays
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
new file mode 100644
index 000000000..cca328b16
--- /dev/null
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -0,0 +1,1002 @@
+"""
+Tests specific to `np.loadtxt` added during the move of loadtxt to be backed
+by C code.
+These tests complement those found in `test_io.py`.
+"""
+
+import sys
+import pytest
+from tempfile import NamedTemporaryFile, mkstemp
+from io import StringIO
+
+import numpy as np
+from numpy.ma.testutils import assert_equal
+from numpy.testing import assert_array_equal, HAS_REFCOUNT, IS_PYPY
+
+
+def test_scientific_notation():
+ """Test that both 'e' and 'E' are parsed correctly."""
+ data = StringIO(
+ (
+ "1.0e-1,2.0E1,3.0\n"
+ "4.0e-2,5.0E-1,6.0\n"
+ "7.0e-3,8.0E1,9.0\n"
+ "0.0e-4,1.0E-1,2.0"
+ )
+ )
+ expected = np.array(
+ [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]]
+ )
+ assert_array_equal(np.loadtxt(data, delimiter=","), expected)
+
+
+@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"])
+def test_comment_multiple_chars(comment):
+ content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n"
+ txt = StringIO(content.replace("#", comment))
+ a = np.loadtxt(txt, delimiter=",", comments=comment)
+ assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]])
+
+
+@pytest.fixture
+def mixed_types_structured():
+ """
+    Fixture providing heterogeneous input data with a structured dtype, along
+ with the associated structured array.
+ """
+ data = StringIO(
+ (
+ "1000;2.4;alpha;-34\n"
+ "2000;3.1;beta;29\n"
+ "3500;9.9;gamma;120\n"
+ "4090;8.1;delta;0\n"
+ "5001;4.4;epsilon;-99\n"
+ "6543;7.8;omega;-1\n"
+ )
+ )
+ dtype = np.dtype(
+ [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)]
+ )
+ expected = np.array(
+ [
+ (1000, 2.4, "alpha", -34),
+ (2000, 3.1, "beta", 29),
+ (3500, 9.9, "gamma", 120),
+ (4090, 8.1, "delta", 0),
+ (5001, 4.4, "epsilon", -99),
+ (6543, 7.8, "omega", -1)
+ ],
+ dtype=dtype
+ )
+ return data, dtype, expected
+
+
+@pytest.mark.parametrize('skiprows', [0, 1, 2, 3])
+def test_structured_dtype_and_skiprows_no_empty_lines(
+ skiprows, mixed_types_structured):
+ data, dtype, expected = mixed_types_structured
+ a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows)
+ assert_array_equal(a, expected[skiprows:])
+
+
+def test_unpack_structured(mixed_types_structured):
+ data, dtype, expected = mixed_types_structured
+
+ a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True)
+ assert_array_equal(a, expected["f0"])
+ assert_array_equal(b, expected["f1"])
+ assert_array_equal(c, expected["f2"])
+ assert_array_equal(d, expected["f3"])
+
+
+def test_structured_dtype_with_shape():
+ dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)])
+ data = StringIO("0,1,2,3\n6,7,8,9\n")
+ expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype)
+ assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected)
+
+
+def test_structured_dtype_with_multi_shape():
+ dtype = np.dtype([("a", "u1", (2, 2))])
+ data = StringIO("0 1 2 3\n")
+ expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype)
+ assert_array_equal(np.loadtxt(data, dtype=dtype), expected)
+
+
+def test_nested_structured_subarray():
+ # Test from gh-16678
+ point = np.dtype([('x', float), ('y', float)])
+ dt = np.dtype([('code', int), ('points', point, (2,))])
+ data = StringIO("100,1,2,3,4\n200,5,6,7,8\n")
+ expected = np.array(
+ [
+ (100, [(1., 2.), (3., 4.)]),
+ (200, [(5., 6.), (7., 8.)]),
+ ],
+ dtype=dt
+ )
+ assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected)
+
+
+def test_structured_dtype_offsets():
+ # An aligned structured dtype will have additional padding
+ dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True)
+ data = StringIO("1,2,3,4,5,6\n7,8,9,10,11,12\n")
+ expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt)
+ assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected)
+
+
+@pytest.mark.parametrize("param", ("skiprows", "max_rows"))
+def test_exception_negative_row_limits(param):
+ """skiprows and max_rows should raise for negative parameters."""
+ with pytest.raises(ValueError, match="argument must be nonnegative"):
+ np.loadtxt("foo.bar", **{param: -3})
+
+
+@pytest.mark.parametrize("param", ("skiprows", "max_rows"))
+def test_exception_noninteger_row_limits(param):
+ with pytest.raises(TypeError, match="argument must be an integer"):
+ np.loadtxt("foo.bar", **{param: 1.0})
+
+
+@pytest.mark.parametrize(
+ "data, shape",
+ [
+ ("1 2 3 4 5\n", (1, 5)), # Single row
+ ("1\n2\n3\n4\n5\n", (5, 1)), # Single column
+ ]
+)
+def test_ndmin_single_row_or_col(data, shape):
+ arr = np.array([1, 2, 3, 4, 5])
+ arr2d = arr.reshape(shape)
+
+ assert_array_equal(np.loadtxt(StringIO(data), dtype=int), arr)
+ assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=0), arr)
+ assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=1), arr)
+ assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=2), arr2d)
+
+
+@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"])
+def test_bad_ndmin(badval):
+ with pytest.raises(ValueError, match="Illegal value of ndmin keyword"):
+ np.loadtxt("foo.bar", ndmin=badval)
+
+
+@pytest.mark.parametrize(
+ "ws",
+ (
+ "\t", # tab
+ "\u2003", # em
+ "\u00A0", # non-break
+ "\u3000", # ideographic space
+ )
+)
+def test_blank_lines_spaces_delimit(ws):
+ txt = StringIO(
+ f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1"
+ )
+ # NOTE: It is unclear that the ` # comment` should succeed. Except
+ # for delimiter=None, which should use any whitespace (and maybe
+    # should just be implemented closer to Python).
+ expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]])
+ assert_equal(
+ np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected
+ )
+
+
+def test_blank_lines_normal_delimiter():
+ txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1')
+ expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]])
+ assert_equal(
+ np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected
+ )
+
+
+@pytest.mark.parametrize("dtype", (float, object))
+def test_maxrows_no_blank_lines(dtype):
+ txt = StringIO("1.5,2.5\n3.0,4.0\n5.5,6.0")
+ res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2)
+ assert_equal(res.dtype, dtype)
+ assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype))
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2")))
+def test_exception_message_bad_values(dtype):
+ txt = StringIO("1,2\n3,XXX\n5,6")
+ msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2"
+ with pytest.raises(ValueError, match=msg):
+ np.loadtxt(txt, dtype=dtype, delimiter=",")
+
+
+def test_converters_negative_indices():
+ txt = StringIO('1.5,2.5\n3.0,XXX\n5.5,6.0')
+ conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)}
+ expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]])
+ res = np.loadtxt(
+ txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None
+ )
+ assert_equal(res, expected)
+
+
+def test_converters_negative_indices_with_usecols():
+ txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n')
+ conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)}
+ expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]])
+ res = np.loadtxt(
+ txt,
+ dtype=np.float64,
+ delimiter=",",
+ converters=conv,
+ usecols=[0, -1],
+ encoding=None,
+ )
+ assert_equal(res, expected)
+
+ # Second test with variable number of rows:
+ res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",",
+ usecols=[0, -1], converters={-1: (lambda x: -1)})
+ assert_array_equal(res, [[0, -1], [0, -1]])
+
+def test_ragged_usecols():
+ # usecols, and negative ones, work even with varying number of columns.
+ txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
+ expected = np.array([[0, 0], [0, 0], [0, 0]])
+ res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2])
+ assert_equal(res, expected)
+
+ txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n")
+ with pytest.raises(ValueError,
+ match="invalid column index -2 at row 1 with 2 columns"):
+ # There is no -2 column in the second row:
+ np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2])
+
+
+def test_empty_usecols():
+ txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
+ res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[])
+ assert res.shape == (3,)
+ assert res.dtype == np.dtype([])
+
+
+@pytest.mark.parametrize("c1", ["a", "の", "🫕"])
+@pytest.mark.parametrize("c2", ["a", "の", "🫕"])
+def test_large_unicode_characters(c1, c2):
+ # c1 and c2 span ascii, 16bit and 32bit range.
+ txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g")
+ res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",")
+ expected = np.array(
+ [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")],
+ dtype=np.dtype('U12')
+ )
+ assert_equal(res, expected)
+
+
+def test_unicode_with_converter():
+ txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n")
+ conv = {0: lambda s: s.upper()}
+ res = np.loadtxt(
+ txt,
+ dtype=np.dtype("U12"),
+ converters=conv,
+ delimiter=",",
+ encoding=None
+ )
+ expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']])
+ assert_equal(res, expected)
+
+
+def test_converter_with_structured_dtype():
+ txt = StringIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n')
+ dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')])
+ conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()}
+ res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv)
+ expected = np.array(
+ [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt
+ )
+ assert_equal(res, expected)
+
+
+def test_converter_with_unicode_dtype():
+ """
+ With the default 'bytes' encoding, tokens are encoded prior to being
+ passed to the converter. This means that the output of the converter may
+ be bytes instead of unicode as expected by `read_rows`.
+
+ This test checks that outputs from the above scenario are properly decoded
+ prior to parsing by `read_rows`.
+ """
+ txt = StringIO('abc,def\nrst,xyz')
+ conv = bytes.upper
+ res = np.loadtxt(
+ txt, dtype=np.dtype("U3"), converters=conv, delimiter=",")
+ expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']])
+ assert_equal(res, expected)
+
+
+def test_read_huge_row():
+ row = "1.5, 2.5," * 50000
+ row = row[:-1] + "\n"
+ txt = StringIO(row * 2)
+ res = np.loadtxt(txt, delimiter=",", dtype=float)
+ assert_equal(res, np.tile([1.5, 2.5], (2, 50000)))
+
+
+@pytest.mark.parametrize("dtype", "edfgFDG")
+def test_huge_float(dtype):
+ # Covers a non-optimized path that is rarely taken:
+ field = "0" * 1000 + ".123456789"
+ dtype = np.dtype(dtype)
+ value = np.loadtxt([field], dtype=dtype)[()]
+ assert value == dtype.type("0.123456789")
+
+
+@pytest.mark.parametrize(
+ ("given_dtype", "expected_dtype"),
+ [
+ ("S", np.dtype("S5")),
+ ("U", np.dtype("U5")),
+ ],
+)
+def test_string_no_length_given(given_dtype, expected_dtype):
+ """
+ The given dtype is just 'S' or 'U' with no length. In these cases, the
+ length of the resulting dtype is determined by the longest string found
+ in the file.
+ """
+ txt = StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n")
+ res = np.loadtxt(txt, dtype=given_dtype, delimiter=",")
+ expected = np.array(
+ [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype
+ )
+ assert_equal(res, expected)
+ assert_equal(res.dtype, expected_dtype)
+
+
+def test_float_conversion():
+ """
+    Check that the conversion to float64 is as accurate as the Python
+    built-in `float` function. In a naive version of the float parser,
+ these strings resulted in values that were off by an ULP or two.
+ """
+ strings = [
+ '0.9999999999999999',
+ '9876543210.123456',
+ '5.43215432154321e+300',
+ '0.901',
+ '0.333',
+ ]
+ txt = StringIO('\n'.join(strings))
+ res = np.loadtxt(txt)
+ expected = np.array([float(s) for s in strings])
+ assert_equal(res, expected)
+
+
+def test_bool():
+ # Simple test for bool via integer
+ txt = StringIO("1, 0\n10, -1")
+ res = np.loadtxt(txt, dtype=bool, delimiter=",")
+ assert res.dtype == bool
+ assert_array_equal(res, [[True, False], [True, True]])
+ # Make sure we use only 1 and 0 on the byte level:
+ assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]])
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
+def test_integer_signs(dtype):
+ dtype = np.dtype(dtype)
+ assert np.loadtxt(["+2"], dtype=dtype) == 2
+ if dtype.kind == "u":
+ with pytest.raises(ValueError):
+ np.loadtxt(["-1\n"], dtype=dtype)
+ else:
+ assert np.loadtxt(["-2\n"], dtype=dtype) == -2
+
+ for sign in ["++", "+-", "--", "-+"]:
+ with pytest.raises(ValueError):
+ np.loadtxt([f"{sign}2\n"], dtype=dtype)
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
+def test_implicit_cast_float_to_int_fails(dtype):
+ txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6")
+ with pytest.raises(ValueError):
+ np.loadtxt(txt, dtype=dtype, delimiter=",")
+
+@pytest.mark.parametrize("dtype", (np.complex64, np.complex128))
+@pytest.mark.parametrize("with_parens", (False, True))
+def test_complex_parsing(dtype, with_parens):
+ s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)"
+ if not with_parens:
+ s = s.replace("(", "").replace(")", "")
+
+ res = np.loadtxt(StringIO(s), dtype=dtype, delimiter=",")
+ expected = np.array(
+ [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype
+ )
+ assert_equal(res, expected)
+
+
+def test_read_from_generator():
+ def gen():
+ for i in range(4):
+ yield f"{i},{2*i},{i**2}"
+
+ res = np.loadtxt(gen(), dtype=int, delimiter=",")
+ expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]])
+ assert_equal(res, expected)
+
+
+def test_read_from_generator_multitype():
+ def gen():
+ for i in range(3):
+ yield f"{i} {i / 4}"
+
+ res = np.loadtxt(gen(), dtype="i, d", delimiter=" ")
+ expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d")
+ assert_equal(res, expected)
+
+
+def test_read_from_bad_generator():
+ def gen():
+ for entry in ["1,2", b"3, 5", 12738]:
+ yield entry
+
+ with pytest.raises(
+ TypeError, match=r"non-string returned while reading data"):
+ np.loadtxt(gen(), dtype="i, i", delimiter=",")
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_object_cleanup_on_read_error():
+ sentinel = object()
+ already_read = 0
+
+ def conv(x):
+ nonlocal already_read
+ if already_read > 4999:
+ raise ValueError("failed half-way through!")
+ already_read += 1
+ return sentinel
+
+ txt = StringIO("x\n" * 10000)
+
+ with pytest.raises(ValueError, match="at row 5000, column 1"):
+ np.loadtxt(txt, dtype=object, converters={0: conv})
+
+ assert sys.getrefcount(sentinel) == 2
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+def test_character_not_bytes_compatible():
+ """Test exception when a character cannot be encoded as 'S'."""
+ data = StringIO("–") # == \u2013
+ with pytest.raises(ValueError):
+ np.loadtxt(data, dtype="S5")
+
+
+@pytest.mark.parametrize("conv", (0, [float], ""))
+def test_invalid_converter(conv):
+ msg = (
+ "converters must be a dictionary mapping columns to converter "
+ "functions or a single callable."
+ )
+ with pytest.raises(TypeError, match=msg):
+ np.loadtxt(StringIO("1 2\n3 4"), converters=conv)
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+def test_converters_dict_raises_non_integer_key():
+ with pytest.raises(TypeError, match="keys of the converters dict"):
+ np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int})
+ with pytest.raises(TypeError, match="keys of the converters dict"):
+ np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}, usecols=0)
+
+
+@pytest.mark.parametrize("bad_col_ind", (3, -3))
+def test_converters_dict_raises_non_col_key(bad_col_ind):
+ data = StringIO("1 2\n3 4")
+ with pytest.raises(ValueError, match="converter specified for column"):
+ np.loadtxt(data, converters={bad_col_ind: int})
+
+
+def test_converters_dict_raises_val_not_callable():
+ with pytest.raises(TypeError,
+ match="values of the converters dictionary must be callable"):
+ np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1})
+
+
+@pytest.mark.parametrize("q", ('"', "'", "`"))
+def test_quoted_field(q):
+ txt = StringIO(
+ f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n"
+ )
+ dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)])
+ expected = np.array(
+ [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype
+ )
+
+ res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q)
+ assert_array_equal(res, expected)
+
+
+def test_quote_support_default():
+ """Support for quoted fields is disabled by default."""
+ txt = StringIO('"lat,long", 45, 30\n')
+ dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)])
+
+ with pytest.raises(ValueError, match="the number of columns changed"):
+ np.loadtxt(txt, dtype=dtype, delimiter=",")
+
+ # Enable quoting support with non-None value for quotechar param
+ txt.seek(0)
+ expected = np.array([("lat,long", 45., 30.)], dtype=dtype)
+
+ res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
+ assert_array_equal(res, expected)
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+def test_quotechar_multichar_error():
+ txt = StringIO("1,2\n3,4")
+ msg = r".*must be a single unicode character or None"
+ with pytest.raises(TypeError, match=msg):
+ np.loadtxt(txt, delimiter=",", quotechar="''")
+
+
+def test_comment_multichar_error_with_quote():
+ txt = StringIO("1,2\n3,4")
+ msg = (
+ "when multiple comments or a multi-character comment is given, "
+ "quotes are not supported."
+ )
+ with pytest.raises(ValueError, match=msg):
+ np.loadtxt(txt, delimiter=",", comments="123", quotechar='"')
+ with pytest.raises(ValueError, match=msg):
+ np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"')
+
+ # A single character string in a tuple is unpacked though:
+ res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'")
+ assert_equal(res, [[1, 2], [3, 4]])
+
+
+def test_structured_dtype_with_quotes():
+ data = StringIO(
+ (
+ "1000;2.4;'alpha';-34\n"
+ "2000;3.1;'beta';29\n"
+ "3500;9.9;'gamma';120\n"
+ "4090;8.1;'delta';0\n"
+ "5001;4.4;'epsilon';-99\n"
+ "6543;7.8;'omega';-1\n"
+ )
+ )
+ dtype = np.dtype(
+ [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)]
+ )
+ expected = np.array(
+ [
+ (1000, 2.4, "alpha", -34),
+ (2000, 3.1, "beta", 29),
+ (3500, 9.9, "gamma", 120),
+ (4090, 8.1, "delta", 0),
+ (5001, 4.4, "epsilon", -99),
+ (6543, 7.8, "omega", -1)
+ ],
+ dtype=dtype
+ )
+ res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'")
+ assert_array_equal(res, expected)
+
+
+def test_quoted_field_is_not_empty():
+ txt = StringIO('1\n\n"4"\n""')
+ expected = np.array(["1", "4", ""], dtype="U1")
+ res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"')
+ assert_equal(res, expected)
+
+def test_quoted_field_is_not_empty_nonstrict():
+ # Same as test_quoted_field_is_not_empty but check that we are not strict
+ # about missing closing quote (this is the `csv.reader` default also)
+ txt = StringIO('1\n\n"4"\n"')
+ expected = np.array(["1", "4", ""], dtype="U1")
+ res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"')
+ assert_equal(res, expected)
+
+def test_consecutive_quotechar_escaped():
+ txt = StringIO('"Hello, my name is ""Monty""!"')
+ expected = np.array('Hello, my name is "Monty"!', dtype="U40")
+ res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"')
+ assert_equal(res, expected)
+
+
+@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n"))
+@pytest.mark.parametrize("ndmin", (0, 1, 2))
+@pytest.mark.parametrize("usecols", [None, (1, 2, 3)])
+def test_warn_on_no_data(data, ndmin, usecols):
+ """Check that a UserWarning is emitted when no data is read from input."""
+ if usecols is not None:
+ expected_shape = (0, 3)
+ elif ndmin == 2:
+ expected_shape = (0, 1) # guess a single column?!
+ else:
+ expected_shape = (0,)
+
+ txt = StringIO(data)
+ with pytest.warns(UserWarning, match="input contained no data"):
+ res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols)
+ assert res.shape == expected_shape
+
+ with NamedTemporaryFile(mode="w") as fh:
+ fh.write(data)
+        fh.flush()
+        with pytest.warns(UserWarning, match="input contained no data"):
+            res = np.loadtxt(fh.name, ndmin=ndmin, usecols=usecols)
+ assert res.shape == expected_shape
+
+
+@pytest.mark.parametrize("skiprows", (2, 3))
+def test_warn_on_skipped_data(skiprows):
+ data = "1 2 3\n4 5 6"
+ txt = StringIO(data)
+ with pytest.warns(UserWarning, match="input contained no data"):
+ np.loadtxt(txt, skiprows=skiprows)
+
+
+@pytest.mark.parametrize(["dtype", "value"], [
+ ("i2", 0x0001), ("u2", 0x0001),
+ ("i4", 0x00010203), ("u4", 0x00010203),
+ ("i8", 0x0001020304050607), ("u8", 0x0001020304050607),
+ # The following values are constructed to lead to unique bytes:
+ ("float16", 3.07e-05),
+ ("float32", 9.2557e-41), ("complex64", 9.2557e-41+2.8622554e-29j),
+ ("float64", -1.758571353180402e-24),
+ # Here and below, the repr side-steps a small loss of precision in
+ # complex `str` in PyPy (which is probably fine, as repr works):
+ ("complex128", repr(5.406409232372729e-29-1.758571353180402e-24j)),
+    # Use integer values that fit into double. Everything else leads to
+    # problems, because longdouble values go via double and decimal strings
+    # cause rounding errors.
+ ("longdouble", 0x01020304050607),
+ ("clongdouble", repr(0x01020304050607 + (0x00121314151617 * 1j))),
+ ("U2", "\U00010203\U000a0b0c")])
+@pytest.mark.parametrize("swap", [True, False])
+def test_byteswapping_and_unaligned(dtype, value, swap):
+ # Try to create "interesting" values within the valid unicode range:
+ dtype = np.dtype(dtype)
+    # (repr() is used for some values above because PyPy's str() truncates them)
+    data = [f"x,{value}\n"]
+ if swap:
+ dtype = dtype.newbyteorder()
+ full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False)
+ # The above ensures that the interesting "b" field is unaligned:
+ assert full_dt.fields["b"][1] == 1
+ res = np.loadtxt(data, dtype=full_dt, delimiter=",", encoding=None,
+ max_rows=1) # max-rows prevents over-allocation
+ assert res["b"] == dtype.type(value)
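+# A rough illustration of the "unique bytes" idea above (a sketch, not part of
+# the test itself): each byte of the stored value is distinct, so a
+# byteswapping or alignment bug would change the parsed result.  For example,
+# on an IEEE-754 platform the float32 value should map to the bit pattern
+#
+#     hex(np.array(9.2557e-41, dtype="float32").view(np.uint32))  # 0x10203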
+
+
+@pytest.mark.parametrize("dtype",
+ np.typecodes["AllInteger"] + "efdFD" + "?")
+def test_unicode_whitespace_stripping(dtype):
+ # Test that all numeric types (and bool) strip whitespace correctly
+    # \u202F is a narrow no-break space; `\n` is just whitespace when quoted.
+    # Currently, float128 is skipped, as it did not always support this and
+    # has no "custom" parsing:
+ txt = StringIO(' 3 ,"\u202F2\n"')
+ res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
+ assert_array_equal(res, np.array([3, 2]).astype(dtype))
+
+
+@pytest.mark.parametrize("dtype", "FD")
+def test_unicode_whitespace_stripping_complex(dtype):
+ # Complex has a few extra cases since it has two components and
+ # parentheses
+ line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n"
+ data = [line, line.replace(" ", "\u202F")]
+ res = np.loadtxt(data, dtype=dtype, delimiter=',')
+ assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2))
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype", "FD")
+@pytest.mark.parametrize("field",
+ ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"])
+def test_bad_complex(dtype, field):
+ with pytest.raises(ValueError):
+ np.loadtxt([field + "\n"], dtype=dtype, delimiter=",")
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype",
+ np.typecodes["AllInteger"] + "efgdFDG" + "?")
+def test_nul_character_error(dtype):
+ # Test that a \0 character is correctly recognized as an error even if
+ # what comes before is valid (not everything gets parsed internally).
+ if dtype.lower() == "g":
+ pytest.xfail("longdouble/clongdouble assignment may misbehave.")
+ with pytest.raises(ValueError):
+ np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"')
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+@pytest.mark.parametrize("dtype",
+ np.typecodes["AllInteger"] + "efgdFDG" + "?")
+def test_no_thousands_support(dtype):
+    # Mainly to document behaviour: Python itself accepts underscores as
+    # digit separators (e.g. 1_1), but loadtxt does not.
+    # (e and G may end up using a different conversion path and accept them;
+    # this is a bug, but it happens...)
+ if dtype == "e":
+ pytest.skip("half assignment currently uses Python float converter")
+ if dtype in "eG":
+ pytest.xfail("clongdouble assignment is buggy (uses `complex`?).")
+
+ assert int("1_1") == float("1_1") == complex("1_1") == 11
+ with pytest.raises(ValueError):
+ np.loadtxt(["1_1\n"], dtype=dtype)
+
+
+@pytest.mark.parametrize("data", [
+ ["1,2\n", "2\n,3\n"],
+ ["1,2\n", "2\r,3\n"]])
+def test_bad_newline_in_iterator(data):
+ # In NumPy <=1.22 this was accepted, because newlines were completely
+ # ignored when the input was an iterable. This could be changed, but right
+ # now, we raise an error.
+ msg = "Found an unquoted embedded newline within a single line"
+ with pytest.raises(ValueError, match=msg):
+ np.loadtxt(data, delimiter=",")
+
+
+@pytest.mark.parametrize("data", [
+ ["1,2\n", "2,3\r\n"], # a universal newline
+ ["1,2\n", "'2\n',3\n"], # a quoted newline
+ ["1,2\n", "'2\r',3\n"],
+ ["1,2\n", "'2\r\n',3\n"],
+])
+def test_good_newline_in_iterator(data):
+ # The quoted newlines will be untransformed here, but are just whitespace.
+ res = np.loadtxt(data, delimiter=",", quotechar="'")
+ assert_array_equal(res, [[1., 2.], [2., 3.]])
+
+
+@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"])
+def test_universal_newlines_quoted(newline):
+    # Check that universal newline support within the tokenizer is not
+    # applied to quoted fields.  (Note that lines must end in a newline, or
+    # quoted fields will not include a newline at all.)
+ data = ['1,"2\n"\n', '3,"4\n', '1"\n']
+ data = [row.replace("\n", newline) for row in data]
+ res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"')
+ assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']])
+
+
+def test_null_character():
+ # Basic tests to check that the NUL character is not special:
+ res = np.loadtxt(["1\0002\0003\n", "4\0005\0006"], delimiter="\000")
+ assert_array_equal(res, [[1, 2, 3], [4, 5, 6]])
+
+ # Also not as part of a field (avoid unicode/arrays as unicode strips \0)
+ res = np.loadtxt(["1\000,2\000,3\n", "4\000,5\000,6"],
+ delimiter=",", dtype=object)
+ assert res.tolist() == [["1\000", "2\000", "3"], ["4\000", "5\000", "6"]]
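+    # ("\000" above is just the octal escape for the NUL byte.)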
+
+
+def test_iterator_fails_getting_next_line():
+ class BadSequence:
+ def __len__(self):
+ return 100
+
+ def __getitem__(self, item):
+ if item == 50:
+ raise RuntimeError("Bad things happened!")
+ return f"{item}, {item+1}"
+
+ with pytest.raises(RuntimeError, match="Bad things happened!"):
+ np.loadtxt(BadSequence(), dtype=int, delimiter=",")
+
+
+class TestCReaderUnitTests:
+    # These are internal tests for paths that should not be possible to hit
+    # unless things go very, very wrong somewhere.
+ def test_not_an_filelike(self):
+ with pytest.raises(AttributeError, match=".*read"):
+ np.core._multiarray_umath._load_from_filelike(
+ object(), dtype=np.dtype("i"), filelike=True)
+
+ def test_filelike_read_fails(self):
+ # Can only be reached if loadtxt opens the file, so it is hard to do
+ # via the public interface (although maybe not impossible considering
+ # the current "DataClass" backing).
+ class BadFileLike:
+ counter = 0
+
+ def read(self, size):
+ self.counter += 1
+ if self.counter > 20:
+ raise RuntimeError("Bad bad bad!")
+ return "1,2,3\n"
+
+ with pytest.raises(RuntimeError, match="Bad bad bad!"):
+ np.core._multiarray_umath._load_from_filelike(
+ BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+ def test_filelike_bad_read(self):
+ # Can only be reached if loadtxt opens the file, so it is hard to do
+ # via the public interface (although maybe not impossible considering
+ # the current "DataClass" backing).
+
+ class BadFileLike:
+ counter = 0
+
+ def read(self, size):
+ return 1234 # not a string!
+
+ with pytest.raises(TypeError,
+ match="non-string returned while reading data"):
+ np.core._multiarray_umath._load_from_filelike(
+ BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+ def test_not_an_iter(self):
+ with pytest.raises(TypeError,
+ match="error reading from object, expected an iterable"):
+ np.core._multiarray_umath._load_from_filelike(
+ object(), dtype=np.dtype("i"), filelike=False)
+
+ def test_bad_type(self):
+ with pytest.raises(TypeError, match="internal error: dtype must"):
+ np.core._multiarray_umath._load_from_filelike(
+ object(), dtype="i", filelike=False)
+
+ def test_bad_encoding(self):
+ with pytest.raises(TypeError, match="encoding must be a unicode"):
+ np.core._multiarray_umath._load_from_filelike(
+ object(), dtype=np.dtype("i"), filelike=False, encoding=123)
+
+ @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"])
+ def test_manual_universal_newlines(self, newline):
+        # This is currently not available to users, because we should always
+        # open files with universal newlines enabled (`newline=None`).
+        # (And reading from an iterator uses slightly different code paths.)
+        # We have no real support for `newline="\r"` or `newline="\n"` as the
+        # user cannot specify those options.
+ data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline),
+ newline="")
+
+ res = np.core._multiarray_umath._load_from_filelike(
+ data, dtype=np.dtype("U10"), filelike=True,
+ quote='"', comment="#", skiplines=1)
+ assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])
+
+
+def test_delimiter_comment_collision_raises():
+ with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+ np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
+
+
+def test_delimiter_quotechar_collision_raises():
+ with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+ np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",")
+
+
+def test_comment_quotechar_collision_raises():
+ with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+ np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#")
+
+
+def test_delimiter_and_multiple_comments_collision_raises():
+ with pytest.raises(
+ TypeError, match="Comment characters.*cannot include the delimiter"
+ ):
+ np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","])
+
+
+@pytest.mark.parametrize(
+ "ws",
+ (
+ " ", # space
+ "\t", # tab
+        "\u2003", # em space
+        "\u00A0", # no-break space
+ "\u3000", # ideographic space
+ )
+)
+def test_collision_with_default_delimiter_raises(ws):
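+    # With the default delimiter (None), any whitespace separates fields, so a
+    # whitespace comment or quote character collides with it and is rejected.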
+ with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+ np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws)
+ with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+ np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws)
+
+
+@pytest.mark.parametrize("nl", ("\n", "\r"))
+def test_control_character_newline_raises(nl):
+ txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}")
+ msg = "control character.*cannot be a newline"
+ with pytest.raises(TypeError, match=msg):
+ np.loadtxt(txt, delimiter=nl)
+ with pytest.raises(TypeError, match=msg):
+ np.loadtxt(txt, comments=nl)
+ with pytest.raises(TypeError, match=msg):
+ np.loadtxt(txt, quotechar=nl)
+
+
+@pytest.mark.parametrize(
+ ("generic_data", "long_datum", "unitless_dtype", "expected_dtype"),
+ [
+ ("2012-03", "2013-01-15", "M8", "M8[D]"), # Datetimes
+ ("spam-a-lot", "tis_but_a_scratch", "U", "U17"), # str
+ ],
+)
+@pytest.mark.parametrize("nrows", (10, 50000, 60000)) # lt, eq, gt chunksize
+def test_parametric_unit_discovery(
+ generic_data, long_datum, unitless_dtype, expected_dtype, nrows
+):
+    """Check that the correct unit (e.g. month, day, second) or itemsize is
+    discovered from the data when the user specifies a unitless datetime (or
+    an unsized string) dtype."""
+    # For the datetime case, the unit should be "D" (days) due to the last
+    # entry.
+    data = [generic_data] * nrows + [long_datum]
+ expected = np.array(data, dtype=expected_dtype)
+
+ # file-like path
+ txt = StringIO("\n".join(data))
+ a = np.loadtxt(txt, dtype=unitless_dtype)
+ assert a.dtype == expected.dtype
+ assert_equal(a, expected)
+
+    # file-name path (loadtxt opens the file itself)
+ fd, fname = mkstemp()
+ with open(fname, "w") as fh:
+ fh.write("\n".join(data))
+ a = np.loadtxt(fname, dtype=unitless_dtype)
+ assert a.dtype == expected.dtype
+ assert_equal(a, expected)
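+# A minimal illustration of the discovery exercised above (a sketch, with
+# values taken from the datetime parametrization): a single finer-resolution
+# entry should decide the unit of the whole column, e.g.
+#
+#     np.loadtxt(["2012-03", "2013-01-15"], dtype="M8").dtype  # -> M8[D]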
+
+
+def test_str_dtype_unit_discovery_with_converter():
+ data = ["spam-a-lot"] * 60000 + ["XXXtis_but_a_scratch"]
+ expected = np.array(
+ ["spam-a-lot"] * 60000 + ["tis_but_a_scratch"], dtype="U17"
+ )
+ conv = lambda s: s.strip("XXX")
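+    # The itemsize is discovered after the converter has run: the raw field
+    # "XXXtis_but_a_scratch" is 20 characters long, but the stored value only
+    # needs "U17".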
+
+ # file-like path
+ txt = StringIO("\n".join(data))
+ a = np.loadtxt(txt, dtype="U", converters=conv, encoding=None)
+ assert a.dtype == expected.dtype
+ assert_equal(a, expected)
+
+    # file-name path (loadtxt opens the file itself)
+ fd, fname = mkstemp()
+ with open(fname, "w") as fh:
+ fh.write("\n".join(data))
+ a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None)
+ assert a.dtype == expected.dtype
+ assert_equal(a, expected)
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+ reason="PyPy bug in error formatting")
+def test_control_character_empty():
+ with pytest.raises(TypeError, match="Text reading control character must"):
+ np.loadtxt(StringIO("1 2 3"), delimiter="")
+ with pytest.raises(TypeError, match="Text reading control character must"):
+ np.loadtxt(StringIO("1 2 3"), quotechar="")
+ with pytest.raises(ValueError, match="comments cannot be an empty string"):
+ np.loadtxt(StringIO("1 2 3"), comments="")
+ with pytest.raises(ValueError, match="comments cannot be an empty string"):
+ np.loadtxt(StringIO("1 2 3"), comments=["#", ""])
+
+
+def test_control_characters_as_bytes():
+ """Byte control characters (comments, delimiter) are supported."""
+ a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
+ assert_equal(a, [1, 2, 3])
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 2c71e45bd..80a6fdd10 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -810,7 +810,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
'Mismatched elements: {} / {} ({:.3g}%)'.format(
n_mismatch, n_elements, percent_mismatch)]
- with errstate(invalid='ignore', divide='ignore'):
+ with errstate(all='ignore'):
# ignore errors for non-numeric types
with contextlib.suppress(TypeError):
error = abs(x - y)
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 31d2cdc76..919ca751f 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -207,6 +207,14 @@ class TestArrayEqual(_GenericTest):
self._test_not_equal(a, b)
self._test_not_equal(b, a)
+ def test_suppress_overflow_warnings(self):
+ # Based on issue #18992
+ with pytest.raises(AssertionError):
+ with np.errstate(all="raise"):
+ np.testing.assert_array_equal(
+ np.array([1, 2, 3], np.float32),
+ np.array([1, 1e-40, 3], np.float32))
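+        # (Presumably the relative-error computation in assert_array_compare
+        # divides by the tiny 1e-40 value and overflows float32 to inf; with
+        # errstate(all="raise") above, that would raise FloatingPointError
+        # instead of the expected AssertionError unless the helper suppresses
+        # it via errstate(all='ignore').)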
+
class TestBuildErrorMessage: