Diffstat (limited to 'numpy')
34 files changed, 4999 insertions, 347 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py index abe53fe9a..46d80fb76 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -52,8 +52,6 @@ of numpy are available under the ``doc`` sub-module:: Available subpackages --------------------- -doc - Topical documentation on broadcasting, indexing, etc. lib Basic functions used by several sub-packages. random @@ -66,8 +64,6 @@ polynomial Polynomial tools testing NumPy testing tools -f2py - Fortran to Python Interface Generator. distutils Enhancements to distutils with support for Fortran compilers support and more. diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a13480907..9704cff0a 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -869,6 +869,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'typeinfo.h'), join('src', 'multiarray', 'usertypes.h'), join('src', 'multiarray', 'vdot.h'), + join('src', 'multiarray', 'textreading', 'readtext.h'), join('include', 'numpy', 'arrayobject.h'), join('include', 'numpy', '_neighborhood_iterator_imp.h'), join('include', 'numpy', 'npy_endian.h'), @@ -947,6 +948,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'x86-qsort.dispatch.c.src'), join('src', 'npysort', 'quicksort.c.src'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), @@ -956,6 +958,14 @@ def configuration(parent_package='',top_path=None): join('src', 'npysort', 'selection.cpp'), join('src', 'common', 'npy_binsearch.h'), join('src', 'npysort', 'binsearch.cpp'), + join('src', 'multiarray', 'textreading', 'conversions.c'), + join('src', 'multiarray', 'textreading', 'field_types.c'), + join('src', 'multiarray', 'textreading', 'growth.c'), + join('src', 'multiarray', 'textreading', 'readtext.c'), + join('src', 'multiarray', 'textreading', 'rows.c'), + join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), + join('src', 'multiarray', 'textreading', 'str_to_int.c'), + join('src', 'multiarray', 'textreading', 'tokenize.c.src'), ] ####################################################################### diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index f8632e701..93e9d9d45 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -371,7 +371,79 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) #define npyv_sum_u64 _mm512_reduce_add_epi64 #define npyv_sum_f32 _mm512_reduce_add_ps #define npyv_sum_f64 _mm512_reduce_add_pd + #define npyv_reducemin_u32 _mm512_reduce_min_epu32 + #define npyv_reducemin_s32 _mm512_reduce_min_epi32 + #define npyv_reducemin_f32 _mm512_reduce_min_ps + #define npyv_reducemax_u32 _mm512_reduce_max_epu32 + #define npyv_reducemax_s32 _mm512_reduce_max_epi32 + #define npyv_reducemax_f32 _mm512_reduce_max_ps #else + NPY_FINLINE npy_uint32 npyv_reducemax_u32(npyv_u32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_u32 a1 = _mm512_max_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_u32 a2 = _mm512_max_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_u32 a3 = _mm512_max_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_u32 a4 = _mm512_max_epu32(a3, 
_mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_int32 npyv_reducemax_s32(npyv_s32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_s32 a1 = _mm512_max_epi32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_s32 a2 = _mm512_max_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_s32 a3 = _mm512_max_epi32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_s32 a4 = _mm512_max_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_float npyv_reducemax_f32(npyv_f32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_f32 a1 = _mm512_max_ps(a, _mm512_permutex2var_ps(a, idx1, a)); + npyv_f32 a2 = _mm512_max_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); + npyv_f32 a3 = _mm512_max_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_f32 a4 = _mm512_max_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); + } + + NPY_FINLINE npy_uint32 npyv_reducemin_u32(npyv_u32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_u32 a1 = _mm512_min_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_u32 a2 = _mm512_min_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_u32 a3 = _mm512_min_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_u32 a4 = _mm512_min_epu32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_int32 npyv_reducemin_s32(npyv_s32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_s32 a1 = _mm512_min_epi32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_s32 a2 = _mm512_min_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_s32 a3 = _mm512_min_epi32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_s32 a4 = _mm512_min_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_float npyv_reducemin_f32(npyv_f32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_f32 a1 = _mm512_min_ps(a, _mm512_permutex2var_ps(a, idx1, a)); + npyv_f32 a2 = _mm512_min_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); + npyv_f32 a3 = _mm512_min_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_f32 a4 = _mm512_min_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); + } + NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) + { + __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a)); diff --git a/numpy/core/src/multiarray/conversion_utils.c
b/numpy/core/src/multiarray/conversion_utils.c index a1de580d9..e4eb4f49e 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o) } +NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val) +{ + *val = PyArray_PyIntAsIntp(o); + if (error_converting(*val)) { + return NPY_FAIL; + } + return NPY_SUCCEED; +} + + /* * PyArray_IntpFromIndexSequence * Returns the number of dimensions or -1 if an error occurred. diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 4072841ee..4d0fbb894 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -7,6 +7,9 @@ NPY_NO_EXPORT int PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq); NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val); + +NPY_NO_EXPORT int PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq); typedef enum { diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 789446d0c..a7b6898e1 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ +#include "textreading/readtext.h" /* _readtext_from_file_object */ #include "npy_dlpack.h" @@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, METH_O, NULL}, + {"_load_from_filelike", (PyCFunction)_load_from_filelike, + METH_FASTCALL | METH_KEYWORDS, NULL}, /* from umath */ {"frompyfunc", (PyCFunction) ufunc_frompyfunc, diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c new file mode 100644 index 000000000..11f4210f7 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -0,0 +1,395 @@ + +#include <Python.h> + +#include <string.h> +#include <stdlib.h> +#include <stdbool.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + +#include "conversions.h" +#include "str_to_int.h" + +#include "array_coercion.h" + + +/* + * Coercion to boolean is done via integer right now. + */ +NPY_NO_EXPORT int +to_bool(PyArray_Descr *NPY_UNUSED(descr), + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + int64_t res; + if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) { + return -1; + } + *dataptr = (char)(res != 0); + return 0; +} + + +/* + * In order to not pack a whole copy of a floating point parser, we copy the + * result into ascii and call the Python one. Float parsing isn't super quick + * so this is not terrible, but avoiding it would speed up things. + * + * Also note that parsing the first float of a complex will copy the whole + * string to ascii rather than just the first part. + * TODO: A tweak of the break might be a simple mitigation there. + * + * @param str The UCS4 string to parse + * @param end Pointer to the end of the string + * @param skip_trailing_whitespace If false does not skip trailing whitespace + * (used by the complex parser). + * @param result Output stored as double value. 
+ */ +static NPY_INLINE int +double_from_ucs4( + const Py_UCS4 *str, const Py_UCS4 *end, + bool strip_whitespace, double *result, const Py_UCS4 **p_end) +{ + /* skip leading whitespace */ + if (strip_whitespace) { + while (Py_UNICODE_ISSPACE(*str)) { + str++; + } + } + if (str == end) { + return -1; /* empty or only whitespace: not a floating point number */ + } + + /* We convert to ASCII for the Python parser, use stack if small: */ + char stack_buf[128]; + char *heap_buf = NULL; + char *ascii = stack_buf; + + size_t str_len = end - str + 1; + if (str_len > 128) { + heap_buf = PyMem_MALLOC(str_len); + if (heap_buf == NULL) { + PyErr_NoMemory(); + return -1; + } + ascii = heap_buf; + } + char *c = ascii; + for (; str < end; str++, c++) { + if (NPY_UNLIKELY(*str >= 128)) { + /* Character cannot be used, ignore for end calculation and stop */ + end = str; + break; + } + *c = (char)(*str); + } + *c = '\0'; + + char *end_parsed; + *result = PyOS_string_to_double(ascii, &end_parsed, NULL); + /* Rewind `end` to the first UCS4 character not parsed: */ + end = end - (c - end_parsed); + + PyMem_FREE(heap_buf); + + if (*result == -1. && PyErr_Occurred()) { + return -1; + } + + if (strip_whitespace) { + /* and then skip any remainig whitespace: */ + while (Py_UNICODE_ISSPACE(*end)) { + end++; + } + } + *p_end = end; + return 0; +} + + +NPY_NO_EXPORT int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double double_val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + float val = (float)double_val; + memcpy(dataptr, &val, sizeof(float)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap4_unaligned(dataptr); + } + return 0; +} + + +NPY_NO_EXPORT int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + memcpy(dataptr, &val, sizeof(double)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap8_unaligned(dataptr); + } + return 0; +} + + +static bool +to_complex_int( + const Py_UCS4 *item, const Py_UCS4 *token_end, + double *p_real, double *p_imag, + Py_UCS4 imaginary_unit, bool allow_parens) +{ + const Py_UCS4 *p_end; + bool unmatched_opening_paren = false; + + /* Remove whitespace before the possibly leading '(' */ + while (Py_UNICODE_ISSPACE(*item)) { + ++item; + } + if (allow_parens && (*item == '(')) { + unmatched_opening_paren = true; + ++item; + /* Allow whitespace within the parentheses: "( 1j)" */ + while (Py_UNICODE_ISSPACE(*item)) { + ++item; + } + } + if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) { + return false; + } + if (p_end == token_end) { + // No imaginary part in the string (e.g. 
"3.5") + *p_imag = 0.0; + return !unmatched_opening_paren; + } + if (*p_end == imaginary_unit) { + /* Only an imaginary part (e.g "1.5j") */ + *p_imag = *p_real; + *p_real = 0.0; + ++p_end; + } + else if (*p_end == '+' || *p_end == '-') { + /* Imaginary part still to parse */ + if (*p_end == '+') { + ++p_end; /* Advance to support +- (and ++) */ + } + if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) { + return false; + } + if (*p_end != imaginary_unit) { + return false; + } + ++p_end; + } + else { + *p_imag = 0; + } + + if (unmatched_opening_paren) { + /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */ + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + if (*p_end == ')') { + ++p_end; + } + else { + /* parentheses was not closed */ + return false; + } + } + + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + return p_end == token_end; +} + + +NPY_NO_EXPORT int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, + pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex64 val = {(float)real, (float)imag}; + memcpy(dataptr, &val, sizeof(npy_complex64)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap4_unaligned(dataptr); + npy_bswap4_unaligned(dataptr + 4); + } + return 0; +} + + +NPY_NO_EXPORT int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex128 val = {real, imag}; + memcpy(dataptr, &val, sizeof(npy_complex128)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap8_unaligned(dataptr); + npy_bswap8_unaligned(dataptr + 8); + } + return 0; +} + + +/* + * String and unicode conversion functions. + */ +NPY_NO_EXPORT int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + const Py_UCS4* c = str; + size_t length = descr->elsize; + + for (size_t i = 0; i < length; i++) { + if (c < end) { + /* + * loadtxt assumed latin1, which is compatible with UCS1 (first + * 256 unicode characters). + */ + if (NPY_UNLIKELY(*c > 255)) { + /* TODO: Was UnicodeDecodeError, is unspecific error good? */ + return -1; + } + dataptr[i] = (Py_UCS1)(*c); + c++; + } + else { + dataptr[i] = '\0'; + } + } + return 0; +} + + +NPY_NO_EXPORT int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + int length = descr->elsize / 4; + + if (length <= end - str) { + memcpy(dataptr, str, length * 4); + } + else { + size_t given_len = end - str; + memcpy(dataptr, str, given_len * 4); + memset(dataptr + given_len * 4, '\0', (length - given_len) * 4); + } + + if (!PyArray_ISNBO(descr->byteorder)) { + for (int i = 0; i < length; i++) { + npy_bswap4_unaligned(dataptr); + dataptr += 4; + } + } + return 0; +} + + + +/* + * Convert functions helper for the generic converter. 
+ */ +static PyObject * +call_converter_function( + PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters) +{ + PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length); + if (s == NULL) { + return s; + } + if (byte_converters) { + Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL)); + if (s == NULL) { + return NULL; + } + } + if (func == NULL) { + return s; + } + PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL); + Py_DECREF(s); + return result; +} + + +NPY_NO_EXPORT int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config, PyObject *func) +{ + bool use_byte_converter; + if (func == NULL) { + use_byte_converter = config->c_byte_converters; + } + else { + use_byte_converter = config->python_byte_converters; + } + /* Converts to unicode and calls custom converter (if set) */ + PyObject *converted = call_converter_function( + func, str, (size_t)(end - str), use_byte_converter); + if (converted == NULL) { + return -1; + } + + int res = PyArray_Pack(descr, dataptr, converted); + Py_DECREF(converted); + return res; +} + + +NPY_NO_EXPORT int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config) +{ + return to_generic_with_converter(descr, str, end, dataptr, config, NULL); +} diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h new file mode 100644 index 000000000..222eea4e7 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.h @@ -0,0 +1,57 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ + +#include <stdbool.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/parser_config.h" + +NPY_NO_EXPORT int +to_bool(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +NPY_NO_EXPORT int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +NPY_NO_EXPORT int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused, PyObject *func); + +NPY_NO_EXPORT int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c new file mode 100644 index 000000000..0722efd57 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -0,0 +1,201 @@ +#include 
"field_types.h" +#include "conversions.h" +#include "str_to_int.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" +#include "alloc.h" + +#include "textreading/growth.h" + + +NPY_NO_EXPORT void +field_types_xclear(int num_field_types, field_type *ft) { + assert(num_field_types >= 0); + if (ft == NULL) { + return; + } + for (int i = 0; i < num_field_types; i++) { + Py_XDECREF(ft[i].descr); + ft[i].descr = NULL; + } + PyMem_Free(ft); +} + + +/* + * Fetch custom converters for the builtin NumPy DTypes (or the generic one). + * Structured DTypes get unpacked and `object` uses the generic method. + * + * TODO: This should probably be moved on the DType object in some form, + * to allow user DTypes to define their own converters. + */ +static set_from_ucs4_function * +get_from_ucs4_function(PyArray_Descr *descr) +{ + if (descr->type_num == NPY_BOOL) { + return &to_bool; + } + else if (PyDataType_ISSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_int8; + case 2: + return &to_int16; + case 4: + return &to_int32; + case 8: + return &to_int64; + default: + assert(0); + } + } + else if (PyDataType_ISUNSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_uint8; + case 2: + return &to_uint16; + case 4: + return &to_uint32; + case 8: + return &to_uint64; + default: + assert(0); + } + } + else if (descr->type_num == NPY_FLOAT) { + return &to_float; + } + else if (descr->type_num == NPY_DOUBLE) { + return &to_double; + } + else if (descr->type_num == NPY_CFLOAT) { + return &to_cfloat; + } + else if (descr->type_num == NPY_CDOUBLE) { + return &to_cdouble; + } + else if (descr->type_num == NPY_STRING) { + return &to_string; + } + else if (descr->type_num == NPY_UNICODE) { + return &to_unicode; + } + return &to_generic; +} + + +/* + * Note that the function cleans up `ft` on error. If `num_field_types < 0` + * cleanup has already happened in the internal call. 
+ */ +static npy_intp +field_type_grow_recursive(PyArray_Descr *descr, + npy_intp num_field_types, field_type **ft, npy_intp *ft_size, + npy_intp field_offset) +{ + if (PyDataType_HASSUBARRAY(descr)) { + PyArray_Dims shape = {NULL, -1}; + + if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, "invalid subarray shape"); + field_types_xclear(num_field_types, *ft); + return -1; + } + npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + for (npy_intp i = 0; i < size; i++) { + num_field_types = field_type_grow_recursive(descr->subarray->base, + num_field_types, ft, ft_size, field_offset); + field_offset += descr->subarray->base->elsize; + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + else if (PyDataType_HASFIELDS(descr)) { + npy_int num_descr_fields = PyTuple_Size(descr->names); + if (num_descr_fields < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + for (npy_intp i = 0; i < num_descr_fields; i++) { + PyObject *key = PyTuple_GET_ITEM(descr->names, i); + PyObject *tup = PyObject_GetItem(descr->fields, key); + if (tup == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + PyArray_Descr *field_descr; + PyObject *title; + int offset; + if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) { + Py_DECREF(tup); + field_types_xclear(num_field_types, *ft); + return -1; + } + Py_DECREF(tup); + num_field_types = field_type_grow_recursive( + field_descr, num_field_types, ft, ft_size, + field_offset + offset); + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + + if (*ft_size <= num_field_types) { + npy_intp alloc_size = grow_size_and_multiply( + ft_size, 4, sizeof(field_type)); + if (alloc_size < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + field_type *new_ft = PyMem_Realloc(*ft, alloc_size); + if (new_ft == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + *ft = new_ft; + } + + Py_INCREF(descr); + (*ft)[num_field_types].descr = descr; + (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr); + (*ft)[num_field_types].structured_offset = field_offset; + + return num_field_types + 1; +} + + +/* + * Prepare the "field_types" for the given dtypes/descriptors. Currently, + * we copy the itemsize, but the main thing is that we check for custom + * converters. + */ +NPY_NO_EXPORT npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft) +{ + if (descr->subarray != NULL) { + /* + * This could probably be allowed, but NumPy absorbs the dimensions + * so it is an awkward corner case that probably never really worked. + */ + PyErr_SetString(PyExc_TypeError, + "file reader does not support subarray dtypes. 
You can" + "put the dtype into a structured one using " + "`np.dtype(('name', dtype))` to avoid this limitation."); + return -1; + } + + npy_intp ft_size = 4; + *ft = PyMem_Malloc(ft_size * sizeof(field_type)); + if (*ft == NULL) { + return -1; + } + return field_type_grow_recursive(descr, 0, ft, &ft_size, 0); +} diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h new file mode 100644 index 000000000..f26e00a5e --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -0,0 +1,67 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ + +#include <stdint.h> +#include <stdbool.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +/** + * Function defining the conversion for each value. + * + * This function must support unaligned memory access. As of now, there is + * no special error handling (in whatever form): We assume that it is always + * reasonable to raise a `ValueError` noting the string that failed to be + * converted. + * + * NOTE: An earlier version of the code had unused default values (pandas + * does this) when columns are missing. We could define this either + * by passing `NULL` in, or by adding a default explicitly somewhere. + * (I think users should probably have to define the default, at which + * point it doesn't matter here.) + * + * NOTE: We are currently passing the parser config, this could be made public + * or could be set up to be dtype specific/private. Always passing + * pconfig fully seems easier right now even if it may change. + * (A future use-case may for example be user-specified strings that are + * considered boolean True or False). + * + * TODO: Aside from nailing down the above notes, it may be nice to expose + * these function publically. This could allow user DTypes to provide + * a converter or custom converters written in C rather than Python. + * + * @param descr The NumPy descriptor of the field (may be byte-swapped, etc.) + * @param str Pointer to the beginning of the UCS4 string to be parsed. + * @param end Pointer to the end of the UCS4 string. This value is currently + * guaranteed to be `\0`, ensuring that parsers can rely on + * nul-termination. + * @param dataptr The pointer where to store the parsed value + * @param pconfig Additional configuration for the parser. + * @returns 0 on success and -1 on failure. If the return value is -1 an + * error may or may not be set. If an error is set, it is chained + * behind the generic ValueError. + */ +typedef int (set_from_ucs4_function)( + PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, + char *dataptr, parser_config *pconfig); + +typedef struct _field_type { + set_from_ucs4_function *set_from_ucs4; + /* The original NumPy descriptor */ + PyArray_Descr *descr; + /* Offset to this entry within row. 
*/ + npy_intp structured_offset; +} field_type; + + +NPY_NO_EXPORT void +field_types_xclear(int num_field_types, field_type *ft); + +NPY_NO_EXPORT npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ */ diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c new file mode 100644 index 000000000..49a09d572 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.c @@ -0,0 +1,47 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "templ_common.h" + +/* + * Helper function taking the size input and growing it (based on min_grow). + * The current scheme is a minimum growth and a general growth by 25% + * overallocation. This is then capped at 2**20 elements, as that propels us + * in the range of large page sizes (so it is presumably more than enough). + * + * It further multiplies it with `itemsize` and ensures that all results fit + * into an `npy_intp`. + * Returns -1 if any overflow occurred or the result would not fit. + * The user has to ensure the input is ssize_t but not negative. + */ +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize) { + /* min_grow must be a power of two: */ + assert((min_grow & (min_grow - 1)) == 0); + npy_uintp new_size = (npy_uintp)*size; + npy_intp growth = *size >> 2; + if (growth <= min_grow) { + /* can never lead to overflow if we are using min_growth */ + new_size += min_grow; + } + else { + if (growth > 1 << 20) { + /* limit growth to order of MiB (even hugepages are not larger) */ + growth = 1 << 20; + } + new_size += growth + min_grow - 1; + new_size &= ~min_grow; + + if (new_size > NPY_MAX_INTP) { + return -1; + } + } + *size = (npy_intp)new_size; + npy_intp alloc_size; + if (npy_mul_with_overflow_intp(&alloc_size, (npy_intp)new_size, itemsize)) { + return -1; + } + return alloc_size; +} + diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h new file mode 100644 index 000000000..237b77ad3 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.h @@ -0,0 +1,7 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ + +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */ diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h new file mode 100644 index 000000000..00e911667 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/parser_config.h @@ -0,0 +1,61 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ + +#include <stdbool.h> + +typedef struct { + /* + * Field delimiter character. + * Typically ',', ' ', '\t', ignored if `delimiter_is_whitespace` is true. + */ + Py_UCS4 delimiter; + + /* + * Character used to quote fields. + * Typically '"' or "'". To disable quoting we set this to UINT_MAX + * (which is not a valid unicode character and thus cannot occur in the + * file; the same is used for all other characters if necessary). + */ + Py_UCS4 quote; + + /* + * Character(s) that indicates the start of a comment. + * Typically '#', '%' or ';'. 
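The growth policy implemented by grow_size_and_multiply() above (a fixed minimum step for small buffers, roughly 25% over-allocation otherwise, and never more than 2**20 extra elements per step) can be tried out with a small standalone program. The sketch below uses made-up names and is not part of the patch; it also leaves out the rounding of the new size and the size*itemsize overflow check that the real function performs.

#include <stdio.h>
#include <stddef.h>

/* Simplified version of the growth step described above. */
static size_t next_capacity(size_t size, size_t min_grow)
{
    size_t growth = size / 4;                /* ~25% over-allocation */
    if (growth <= min_grow) {
        return size + min_grow;              /* small buffers: fixed minimum step */
    }
    if (growth > (size_t)1 << 20) {
        growth = (size_t)1 << 20;            /* cap a single step at 2**20 elements */
    }
    return size + growth;
}

int main(void)
{
    size_t rows = 0;
    for (int step = 0; step < 10; step++) {
        rows = next_capacity(rows, 512);
        printf("step %d: room for %zu rows\n", step, rows);
    }
    return 0;
}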
+ * When encountered in a line and not inside quotes, all character + * from the comment character(s) to the end of the line are ignored. + */ + Py_UCS4 comment; + + /* + * Ignore whitespace at the beginning of a field (outside/before quotes). + * Is (and must be) set if `delimiter_is_whitespace`. + */ + bool ignore_leading_whitespace; + + /* + * If true, the delimiter is ignored and any unicode whitespace is used + * for splitting (same as `string.split()` in Python). In that case + * `ignore_leading_whitespace` should also be set. + */ + bool delimiter_is_whitespace; + + /* + * The imaginary unit character. Default is `j`. + */ + Py_UCS4 imaginary_unit; + + /* + * Data should be encoded as `latin1` when using python converter + * (implementing `loadtxt` default Python 2 compatibility mode). + * The c byte converter is used when the user requested `dtype="S"`. + * In this case we go via `dtype=object`, however, loadtxt allows latin1 + * while normal object to string casts only accept ASCII, so it ensures + * that that the object array already contains bytes and not strings. + */ + bool python_byte_converters; + bool c_byte_converters; +} parser_config; + + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */ diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c new file mode 100644 index 000000000..7af5ee891 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -0,0 +1,312 @@ +#include <stdio.h> +#include <stdbool.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "npy_argparse.h" +#include "common.h" +#include "conversion_utils.h" + +#include "textreading/parser_config.h" +#include "textreading/stream_pyobject.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/str_to_int.h" + + +// +// `usecols` must point to a Python object that is Py_None or a 1-d contiguous +// numpy array with data type int32. +// +// `dtype` must point to a Python object that is Py_None or a numpy dtype +// instance. If the latter, code and sizes must be arrays of length +// num_dtype_fields, holding the flattened data field type codes and byte +// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, +// but we do that in Python code.) +// +// If both `usecols` and `dtype` are not None, and the data type is compound, +// then len(usecols) must equal num_dtype_fields. +// +// If `dtype` is given and it is compound, and `usecols` is None, then the +// number of columns in the file must match the number of fields in `dtype`. 
+// +static PyObject * +_readtext_from_stream(stream *s, + parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], + Py_ssize_t skiplines, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) +{ + PyArrayObject *arr = NULL; + PyArray_Descr *out_dtype = NULL; + field_type *ft = NULL; + + /* + * If dtypes[0] is dtype the input was not structured and the result + * is considered "homogeneous" and we have to discover the number of + * columns/ + */ + out_dtype = (PyArray_Descr *)dtype; + Py_INCREF(out_dtype); + + Py_ssize_t num_fields = field_types_create(out_dtype, &ft); + if (num_fields < 0) { + goto finish; + } + bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; + + if (!homogeneous && usecols != NULL && num_usecols != num_fields) { + PyErr_Format(PyExc_TypeError, + "If a structured dtype is used, the number of columns in " + "`usecols` must match the effective number of fields. " + "But %zd usecols were given and the number of fields is %zd.", + num_usecols, num_fields); + goto finish; + } + + arr = read_rows( + s, max_rows, num_fields, ft, pc, + num_usecols, usecols, skiplines, converters, + NULL, out_dtype, homogeneous); + if (arr == NULL) { + goto finish; + } + + finish: + Py_XDECREF(out_dtype); + field_types_xclear(num_fields, ft); + return (PyObject *)arr; +} + + +static int +parse_control_character(PyObject *obj, Py_UCS4 *character) +{ + if (obj == Py_None) { + *character = (Py_UCS4)-1; /* character beyond unicode range */ + return 1; + } + if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) != 1) { + PyErr_Format(PyExc_TypeError, + "Text reading control character must be a single unicode " + "character or None; but got: %.100R", obj); + return 0; + } + *character = PyUnicode_READ_CHAR(obj, 0); + return 1; +} + + +/* + * A (somewhat verbose) check that none of the control characters match or are + * newline. Most of these combinations are completely fine, just weird or + * surprising. + * (I.e. there is an implicit priority for control characters, so if a comment + * matches a delimiter, it would just be a comment.) + * In theory some `delimiter=None` paths could have a "meaning", but let us + * assume that users are better of setting one of the control chars to `None` + * for clarity. + * + * This also checks that the control characters cannot be newlines. 
+ */ +static int +error_if_matching_control_characters( + Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment) +{ + char *control_char1; + char *control_char2 = NULL; + if (comment != (Py_UCS4)-1) { + control_char1 = "comment"; + if (comment == '\r' || comment == '\n') { + goto error; + } + else if (comment == quote) { + control_char2 = "quotechar"; + goto error; + } + else if (comment == delimiter) { + control_char2 = "delimiter"; + goto error; + } + } + if (quote != (Py_UCS4)-1) { + control_char1 = "quotechar"; + if (quote == '\r' || quote == '\n') { + goto error; + } + else if (quote == delimiter) { + control_char2 = "delimiter"; + goto error; + } + } + if (delimiter != (Py_UCS4)-1) { + control_char1 = "delimiter"; + if (delimiter == '\r' || delimiter == '\n') { + goto error; + } + } + /* The above doesn't work with delimiter=None, which means "whitespace" */ + if (delimiter == (Py_UCS4)-1) { + control_char1 = "delimiter"; + if (Py_UNICODE_ISSPACE(comment)) { + control_char2 = "comment"; + goto error; + } + else if (Py_UNICODE_ISSPACE(quote)) { + control_char2 = "quotechar"; + goto error; + } + } + return 0; + + error: + if (control_char2 != NULL) { + PyErr_Format(PyExc_TypeError, + "The values for control characters '%s' and '%s' are " + "incompatible", + control_char1, control_char2); + } + else { + PyErr_Format(PyExc_TypeError, + "control character '%s' cannot be a newline (`\\r` or `\\n`).", + control_char1, control_char2); + } + return -1; +} + + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *NPY_UNUSED(mod), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *file; + Py_ssize_t skiplines = 0; + Py_ssize_t max_rows = -1; + PyObject *usecols_obj = Py_None; + PyObject *converters = Py_None; + + PyObject *dtype = Py_None; + PyObject *encoding_obj = Py_None; + const char *encoding = NULL; + + parser_config pc = { + .delimiter = ',', + .comment = '#', + .quote = '"', + .imaginary_unit = 'j', + .delimiter_is_whitespace = false, + .ignore_leading_whitespace = false, + .python_byte_converters = false, + .c_byte_converters = false, + }; + bool filelike = true; + + PyObject *arr = NULL; + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, + "file", NULL, &file, + "|delimiter", &parse_control_character, &pc.delimiter, + "|comment", &parse_control_character, &pc.comment, + "|quote", &parse_control_character, &pc.quote, + "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, + "|usecols", NULL, &usecols_obj, + "|skiplines", &PyArray_IntpFromPyIntConverter, &skiplines, + "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, + "|converters", NULL, &converters, + "|dtype", NULL, &dtype, + "|encoding", NULL, &encoding_obj, + "|filelike", &PyArray_BoolConverter, &filelike, + "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters, + "|c_byte_converters", PyArray_BoolConverter, &pc.c_byte_converters, + NULL, NULL, NULL) < 0) { + return NULL; + } + + /* Reject matching control characters, they just rarely make sense anyway */ + if (error_if_matching_control_characters( + pc.delimiter, pc.quote, pc.comment) < 0) { + return NULL; + } + + if (pc.delimiter == (Py_UCS4)-1) { + pc.delimiter_is_whitespace = true; + /* Ignore leading whitespace to match `string.split(None)` */ + pc.ignore_leading_whitespace = true; + } + + if (!PyArray_DescrCheck(dtype) ) { + PyErr_SetString(PyExc_TypeError, + "internal error: dtype must be provided and be a NumPy dtype"); + return NULL; + } + + if (encoding_obj != 
Py_None) { + if (!PyUnicode_Check(encoding_obj)) { + PyErr_SetString(PyExc_TypeError, + "encoding must be a unicode string."); + return NULL; + } + encoding = PyUnicode_AsUTF8(encoding_obj); + if (encoding == NULL) { + return NULL; + } + } + + /* + * Parse usecols, the rest of NumPy has no clear helper for this, so do + * it here manually. + */ + Py_ssize_t num_usecols = -1; + Py_ssize_t *usecols = NULL; + if (usecols_obj != Py_None) { + num_usecols = PySequence_Length(usecols_obj); + if (num_usecols < 0) { + return NULL; + } + /* Calloc just to not worry about overflow */ + usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + for (Py_ssize_t i = 0; i < num_usecols; i++) { + PyObject *tmp = PySequence_GetItem(usecols_obj, i); + if (tmp == NULL) { + PyMem_FREE(usecols); + return NULL; + } + usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (error_converting(usecols[i])) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, + "usecols must be an int or a sequence of ints but " + "it contains at least one element of type '%s'", + Py_TYPE(tmp)->tp_name); + } + Py_DECREF(tmp); + PyMem_FREE(usecols); + return NULL; + } + Py_DECREF(tmp); + } + } + + stream *s; + if (filelike) { + s = stream_python_file(file, encoding); + } + else { + s = stream_python_iterable(file, encoding); + } + if (s == NULL) { + PyMem_FREE(usecols); + return NULL; + } + + arr = _readtext_from_stream( + s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype); + stream_close(s); + PyMem_FREE(usecols); + return arr; +} + diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h new file mode 100644 index 000000000..5cf48c555 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.h @@ -0,0 +1,7 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *self, PyObject *args, PyObject *kwargs); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c new file mode 100644 index 000000000..e30ff835e --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -0,0 +1,481 @@ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "numpy/npy_3kcompat.h" +#include "alloc.h" + +#include <string.h> +#include <stdbool.h> + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/conversions.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/growth.h" + +/* + * Minimum size to grow the allcoation by (or 25%). The 8KiB means the actual + * growths is within `8 KiB <= size < 16 KiB` (depending on the row size). + */ +#define MIN_BLOCK_SIZE (1 << 13) + + + +/* + * Create the array of converter functions from the Python converters. 
+ */ +static PyObject ** +create_conv_funcs( + PyObject *converters, Py_ssize_t num_fields, const Py_ssize_t *usecols) +{ + assert(converters != Py_None); + + PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *)); + if (conv_funcs == NULL) { + PyErr_NoMemory(); + return NULL; + } + + if (PyCallable_Check(converters)) { + /* a single converter used for all columns individually */ + for (Py_ssize_t i = 0; i < num_fields; i++) { + Py_INCREF(converters); + conv_funcs[i] = converters; + } + return conv_funcs; + } + else if (!PyDict_Check(converters)) { + PyErr_SetString(PyExc_TypeError, + "converters must be a dictionary mapping columns to converter " + "functions or a single callable."); + goto error; + } + + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(converters, &pos, &key, &value)) { + Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError); + if (column == -1 && PyErr_Occurred()) { + PyErr_Format(PyExc_TypeError, + "keys of the converters dictionary must be integers; " + "got %.100R", key); + goto error; + } + if (usecols != NULL) { + /* + * This code searches for the corresponding usecol. It is + * identical to the legacy usecols code, which has two weaknesses: + * 1. It fails for duplicated usecols only setting converter for + * the first one. + * 2. It fails e.g. if usecols uses negative indexing and + * converters does not. (This is a feature, since it allows + * us to correctly normalize converters to result column here.) + */ + Py_ssize_t i = 0; + for (; i < num_fields; i++) { + if (column == usecols[i]) { + column = i; + break; + } + } + if (i == num_fields) { + continue; /* ignore unused converter */ + } + } + else { + if (column < -num_fields || column >= num_fields) { + PyErr_Format(PyExc_ValueError, + "converter specified for column %zd, which is invalid " + "for the number of fields %d.", column, num_fields); + goto error; + } + if (column < 0) { + column += num_fields; + } + } + if (!PyCallable_Check(value)) { + PyErr_Format(PyExc_TypeError, + "values of the converters dictionary must be callable, " + "but the value associated with key %R is not", key); + goto error; + } + Py_INCREF(value); + conv_funcs[column] = value; + } + return conv_funcs; + + error: + for (Py_ssize_t i = 0; i < num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + return NULL; +} + +/** + * Read a file into the provided array, or create (and possibly grow) an + * array to read into. + * + * @param s The stream object/struct providing reading capabilities used by + * the tokenizer. + * @param max_rows The number of rows to read, or -1. If negative + * all rows are read. + * @param num_field_types The number of field types stored in `field_types`. + * @param field_types Information about the dtype for each column (or one if + * `homogeneous`). + * @param pconfig Pointer to the parser config object used by both the + * tokenizer and the conversion functions. + * @param num_usecols The number of columns in `usecols`. + * @param usecols An array of length `num_usecols` or NULL. If given indicates + * which column is read for each individual row (negative columns are + * accepted). + * @param skiplines The number of lines to skip, these lines are ignored. + * @param converters Python dictionary of converters. Finalizing converters + * is difficult without information about the number of columns. + * @param data_array An array to be filled or NULL. In either case a new + * reference is returned (the reference to `data_array` is not stolen). 
+ * @param out_descr The dtype used for allocating a new array. This is not + * used if `data_array` is provided. Note that the actual dtype of the + * returned array can differ for strings. + * @param num_cols Pointer in which the actual (discovered) number of columns + * is returned. This is only relevant if `homogeneous` is true. + * @param homogeneous Whether the datatype of the array is not homogeneous, + * i.e. not structured. In this case the number of columns has to be + * discovered an the returned array will be 2-dimensional rather than + * 1-dimensional. + * + * @returns Returns the result as an array object or NULL on error. The result + * is always a new reference (even when `data_array` was passed in). + */ +NPY_NO_EXPORT PyArrayObject * +read_rows(stream *s, + npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous) +{ + char *data_ptr = NULL; + Py_ssize_t current_num_fields; + npy_intp row_size = out_descr->elsize; + PyObject **conv_funcs = NULL; + + bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); + + int ndim = homogeneous ? 2 : 1; + npy_intp result_shape[2] = {0, 1}; + + bool data_array_allocated = data_array == NULL; + /* Make sure we own `data_array` for the purpose of error handling */ + Py_XINCREF(data_array); + size_t rows_per_block = 1; /* will be increased depending on row size */ + npy_intp data_allocated_rows = 0; + + /* We give a warning if max_rows is used and an empty line is encountered */ + bool give_empty_row_warning = max_rows >= 0; + + int ts_result = 0; + tokenizer_state ts; + if (tokenizer_init(&ts, pconfig) < 0) { + goto error; + } + + /* Set the actual number of fields if it is already known, otherwise -1 */ + Py_ssize_t actual_num_fields = -1; + if (usecols != NULL) { + assert(homogeneous || num_field_types == num_usecols); + actual_num_fields = num_usecols; + } + else if (!homogeneous) { + assert(usecols == NULL || num_field_types == num_usecols); + actual_num_fields = num_field_types; + } + + for (Py_ssize_t i = 0; i < skiplines; i++) { + ts.state = TOKENIZE_GOTO_LINE_END; + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + else if (ts_result != 0) { + /* Fewer lines than skiplines is acceptable */ + break; + } + } + + Py_ssize_t row_count = 0; /* number of rows actually processed */ + while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) { + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + current_num_fields = ts.num_fields; + field_info *fields = ts.fields; + if (NPY_UNLIKELY(ts.num_fields == 0)) { + /* + * Deprecated NumPy 1.23, 2021-01-13 (not really a deprecation, + * but similar policy should apply to removing the warning again) + */ + /* Tokenizer may give a final "empty line" even if there is none */ + if (give_empty_row_warning && ts_result == 0) { + give_empty_row_warning = false; + if (PyErr_WarnFormat(PyExc_UserWarning, 3, + "Input line %zd contained no data and will not be " + "counted towards `max_rows=%zd`. This differs from " + "the behaviour in NumPy <=1.22 which counted lines " + "rather than rows. If desired, the previous behaviour " + "can be achieved by using `itertools.islice`.\n" + "Please see the 1.23 release notes for an example on " + "how to do this. If you wish to ignore this warning, " + "use `warnings.filterwarnings`. 
This warning is " + "expected to be removed in the future and is given " + "only once per `loadtxt` call.", + row_count + skiplines + 1, max_rows) < 0) { + goto error; + } + } + continue; /* Ignore empty line */ + } + + if (NPY_UNLIKELY(data_ptr == NULL)) { + // We've deferred some of the initialization tasks to here, + // because we've now read the first line, and we definitively + // know how many fields (i.e. columns) we will be processing. + if (actual_num_fields == -1) { + actual_num_fields = current_num_fields; + } + + if (converters != Py_None) { + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols); + if (conv_funcs == NULL) { + goto error; + } + } + + /* Note that result_shape[1] is only used if homogeneous is true */ + result_shape[1] = actual_num_fields; + if (homogeneous) { + row_size *= actual_num_fields; + } + + if (data_array == NULL) { + if (max_rows < 0) { + /* + * Negative max_rows denotes to read the whole file, we + * approach this by allocating ever larger blocks. + * Adds a number of rows based on `MIN_BLOCK_SIZE`. + * Note: later code grows assuming this is a power of two. + */ + if (row_size == 0) { + /* actual rows_per_block should not matter here */ + rows_per_block = 512; + } + else { + /* safe on overflow since min_rows will be 0 or 1 */ + size_t min_rows = ( + (MIN_BLOCK_SIZE + row_size - 1) / row_size); + while (rows_per_block < min_rows) { + rows_per_block *= 2; + } + } + data_allocated_rows = rows_per_block; + } + else { + data_allocated_rows = max_rows; + } + result_shape[0] = data_allocated_rows; + Py_INCREF(out_descr); + /* + * We do not use Empty, as it would fill with None + * and requiring decref'ing if we shrink again. + */ + data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr( + ndim, result_shape, out_descr); +#ifdef NPY_RELAXED_STRIDES_DEBUG + /* Incompatible with NPY_RELAXED_STRIDES_DEBUG due to growing */ + if (result_shape[0] == 1) { + PyArray_STRIDES(data_array)[0] = row_size; + } +#endif /* NPY_RELAXED_STRIDES_DEBUG */ + if (data_array == NULL) { + goto error; + } + if (needs_init) { + memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array)); + } + } + else { + assert(max_rows >=0); + data_allocated_rows = max_rows; + } + data_ptr = PyArray_BYTES(data_array); + } + + if (!usecols && (actual_num_fields != current_num_fields)) { + PyErr_Format(PyExc_ValueError, + "the number of columns changed from %d to %d at row %zu; " + "use `usecols` to select a subset and avoid this error", + actual_num_fields, current_num_fields, row_count+1); + goto error; + } + + if (NPY_UNLIKELY(data_allocated_rows == row_count)) { + /* + * Grow by ~25% and rounded up to the next rows_per_block + * NOTE: This is based on very crude timings and could be refined! + */ + npy_intp new_rows = data_allocated_rows; + npy_intp alloc_size = grow_size_and_multiply( + &new_rows, rows_per_block, row_size); + if (alloc_size < 0) { + /* should normally error much earlier, but make sure */ + PyErr_SetString(PyExc_ValueError, + "array is too big. Cannot read file as a single array; " + "providing a maximum number of rows to read may help."); + goto error; + } + + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), alloc_size ? 
alloc_size : 1, + PyArray_HANDLER(data_array)); + if (new_data == NULL) { + PyErr_NoMemory(); + goto error; + } + /* Replace the arrays data since it may have changed */ + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows; + data_ptr = new_data + row_count * row_size; + data_allocated_rows = new_rows; + if (needs_init) { + memset(data_ptr, '\0', (new_rows - row_count) * row_size); + } + } + + for (Py_ssize_t i = 0; i < actual_num_fields; ++i) { + Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. */ + Py_ssize_t col; /* The column as read, remapped by usecols */ + char *item_ptr; + if (homogeneous) { + f = 0; + item_ptr = data_ptr + i * field_types[0].descr->elsize; + } + else { + f = i; + item_ptr = data_ptr + field_types[f].structured_offset; + } + + if (usecols == NULL) { + col = i; + } + else { + col = usecols[i]; + if (col < 0) { + // Python-like column indexing: k = -1 means the last column. + col += current_num_fields; + } + if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) { + PyErr_Format(PyExc_ValueError, + "invalid column index %d at row %zu with %d " + "columns", + usecols[i], current_num_fields, row_count+1); + goto error; + } + } + + /* + * The following function calls represent the main "conversion" + * step, i.e. parsing the unicode string for each field and storing + * the result in the array. + */ + int parser_res; + Py_UCS4 *str = ts.field_buffer + fields[col].offset; + Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; + if (conv_funcs == NULL || conv_funcs[i] == NULL) { + parser_res = field_types[f].set_from_ucs4(field_types[f].descr, + str, end, item_ptr, pconfig); + } + else { + parser_res = to_generic_with_converter(field_types[f].descr, + str, end, item_ptr, pconfig, conv_funcs[i]); + } + + if (NPY_UNLIKELY(parser_res < 0)) { + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + + size_t length = end - str; + PyObject *string = PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, str, length); + if (string == NULL) { + npy_PyErr_ChainExceptions(exc, val, tb); + goto error; + } + PyErr_Format(PyExc_ValueError, + "could not convert string %.100R to %S at " + "row %zu, column %d.", + string, field_types[f].descr, row_count, col+1); + Py_DECREF(string); + npy_PyErr_ChainExceptionsCause(exc, val, tb); + goto error; + } + } + + ++row_count; + data_ptr += row_size; + } + + tokenizer_clear(&ts); + PyMem_FREE(conv_funcs); + + if (data_array == NULL) { + assert(row_count == 0 && result_shape[0] == 0); + if (actual_num_fields == -1) { + /* + * We found no rows and have to discover the number of elements + * we have no choice but to guess 1. + * NOTE: It may make sense to move this outside of here to refine + * the behaviour where necessary. + */ + result_shape[1] = 1; + } + else { + result_shape[1] = actual_num_fields; + } + Py_INCREF(out_descr); + data_array = (PyArrayObject *)PyArray_Empty( + ndim, result_shape, out_descr, 0); + } + + /* + * Note that if there is no data, `data_array` may still be NULL and + * row_count is 0. In that case, always realloc just in case. + */ + if (data_array_allocated && data_allocated_rows != row_count) { + size_t size = row_count * row_size; + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), size ? 
size : 1, + PyArray_HANDLER(data_array)); + if (new_data == NULL) { + Py_DECREF(data_array); + PyErr_NoMemory(); + return NULL; + } + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count; + } + + return data_array; + + error: + PyMem_FREE(conv_funcs); + tokenizer_clear(&ts); + Py_XDECREF(data_array); + return NULL; +} diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h new file mode 100644 index 000000000..20eb9e186 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -0,0 +1,22 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <stdio.h> + +#include "textreading/stream.h" +#include "textreading/field_types.h" +#include "textreading/parser_config.h" + + +NPY_NO_EXPORT PyArrayObject * +read_rows(stream *s, + npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c new file mode 100644 index 000000000..11b03e31c --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -0,0 +1,67 @@ + +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + +#include <string.h> +#include "textreading/str_to_int.h" +#include "textreading/parser_config.h" + + +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX, byteswap_unaligned) \ + NPY_NO_EXPORT int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + int64_t parsed; \ + intw##_t x; \ + \ + if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ + return -1; \ + } \ + else { \ + x = (intw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + byteswap_unaligned(dataptr); \ + } \ + return 0; \ + } + +#define DECLARE_TO_UINT(uintw, UINT_MAX, byteswap_unaligned) \ + NPY_NO_EXPORT int \ + to_##uintw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + uint64_t parsed; \ + uintw##_t x; \ + \ + if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \ + return -1; \ + } \ + else { \ + x = (uintw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + byteswap_unaligned(dataptr); \ + } \ + return 0; \ + } + +#define byteswap_nothing(ptr) + +DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX, byteswap_nothing) +DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX, npy_bswap8_unaligned) + +DECLARE_TO_UINT(uint8, UINT8_MAX, byteswap_nothing) +DECLARE_TO_UINT(uint16, UINT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_UINT(uint32, UINT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_UINT(uint64, UINT64_MAX, npy_bswap8_unaligned) diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h new file mode 100644 index 000000000..a0a89a0ef --- /dev/null +++ 
b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -0,0 +1,174 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + + +/* + * The following two string conversion functions are largely equivalent + * in Pandas. They are in the header file here, to ensure they can be easily + * inline in the other function. + * Unlike pandas, pass in end-pointer (do not rely on \0) and return 0 or -1. + * + * The actual functions are defined using macro templating below. + */ +NPY_FINLINE int +str_to_int64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + int64_t int_min, int64_t int_max, int64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + bool isneg = 0; + int64_t number = 0; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = true; + ++p; + } + else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? + if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +NPY_FINLINE int +str_to_uint64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + uint64_t uint_max, uint64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + return -1; + } + if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + + // Process the digits. + d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? 
+ if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +#define DECLARE_TO_INT_PROTOTYPE(intw) \ + NPY_NO_EXPORT int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig); + +DECLARE_TO_INT_PROTOTYPE(int8) +DECLARE_TO_INT_PROTOTYPE(int16) +DECLARE_TO_INT_PROTOTYPE(int32) +DECLARE_TO_INT_PROTOTYPE(int64) + +DECLARE_TO_INT_PROTOTYPE(uint8) +DECLARE_TO_INT_PROTOTYPE(uint16) +DECLARE_TO_INT_PROTOTYPE(uint32) +DECLARE_TO_INT_PROTOTYPE(uint64) + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h new file mode 100644 index 000000000..59bd14074 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -0,0 +1,41 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ + +#include <stdint.h> + +/* + * When getting the next line, we hope that the buffer provider can already + * give some information about the newlines, because for Python iterables + * we definitely expect to get line-by-line buffers. + * + * BUFFER_IS_FILEEND must be returned when the end of the file is reached and + * must NOT be returned together with a valid (non-empty) buffer. + */ +#define BUFFER_MAY_CONTAIN_NEWLINE 0 +#define BUFFER_IS_LINEND 1 +#define BUFFER_IS_FILEEND 2 + +/* + * Base struct for streams. We currently have two, a chunked reader for + * filelikes and a line-by-line for any iterable. + * As of writing, the chunked reader was only used for filelikes not already + * opened. That is to preserve the amount read in case of an error exactly. + * If we drop this, we could read it more often (but not when `max_rows` is + * used). + * + * The "streams" can extend this struct to store their own data (so it is + * a very lightweight "object"). + */ +typedef struct _stream { + int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind); + // Note that the first argument to stream_close is the stream pointer + // itself, not the stream_data pointer. + int (*stream_close)(struct _stream *strm); +} stream; + + +#define stream_nextbuf(s, start, end, kind) \ + ((s)->stream_nextbuf((s), start, end, kind)) +#define stream_close(s) ((s)->stream_close((s))) + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c new file mode 100644 index 000000000..6f84ff01d --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -0,0 +1,239 @@ +/* + * C side structures to provide capabilities to read Python file like objects + * in chunks, or iterate through iterables with each result representing a + * single line of a file. + */ + +#include <stdio.h> +#include <stdlib.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/stream.h" + +#define READ_CHUNKSIZE 1 << 14 + + +typedef struct { + stream stream; + /* The Python file object being read. */ + PyObject *file; + + /* The `read` attribute of the file object. */ + PyObject *read; + /* Amount to read each time we call `obj.read()` */ + PyObject *chunksize; + + /* Python str object holding the line most recently read from the file. 
*/ + PyObject *chunk; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_chunks_from_file; + + +/* + * Helper function to support byte objects as well as unicode strings. + * + * NOTE: Steals a reference to `str` (although usually returns it unmodified). + */ +static NPY_INLINE PyObject * +process_stringlike(PyObject *str, const char *encoding) +{ + if (PyBytes_Check(str)) { + PyObject *ustr; + ustr = PyUnicode_FromEncodedObject(str, encoding, NULL); + if (ustr == NULL) { + return NULL; + } + Py_DECREF(str); + return ustr; + } + else if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, + "non-string returned while reading data"); + Py_DECREF(str); + return NULL; + } + return str; +} + + +static NPY_INLINE void +buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind) +{ + Py_ssize_t length = PyUnicode_GET_LENGTH(str); + *kind = PyUnicode_KIND(str); + + if (*kind == PyUnicode_1BYTE_KIND) { + *start = (char *)PyUnicode_1BYTE_DATA(str); + } + else if (*kind == PyUnicode_2BYTE_KIND) { + *start = (char *)PyUnicode_2BYTE_DATA(str); + length *= sizeof(Py_UCS2); + } + else if (*kind == PyUnicode_4BYTE_KIND) { + *start = (char *)PyUnicode_4BYTE_DATA(str); + length *= sizeof(Py_UCS4); + } + *end = *start + length; +} + + +static int +fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) +{ + Py_XDECREF(fb->chunk); + fb->chunk = NULL; + + PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL); + if (chunk == NULL) { + return -1; + } + fb->chunk = process_stringlike(chunk, fb->encoding); + if (fb->chunk == NULL) { + return -1; + } + buffer_info_from_unicode(fb->chunk, start, end, kind); + if (*start == *end) { + return BUFFER_IS_FILEEND; + } + return BUFFER_MAY_CONTAIN_NEWLINE; +} + + +static int +fb_del(stream *strm) +{ + python_chunks_from_file *fb = (python_chunks_from_file *)strm; + + Py_XDECREF(fb->file); + Py_XDECREF(fb->read); + Py_XDECREF(fb->chunksize); + Py_XDECREF(fb->chunk); + + PyMem_FREE(strm); + + return 0; +} + + +NPY_NO_EXPORT stream * +stream_python_file(PyObject *obj, const char *encoding) +{ + python_chunks_from_file *fb; + + fb = (python_chunks_from_file *)PyMem_Calloc(1, sizeof(python_chunks_from_file)); + if (fb == NULL) { + PyErr_NoMemory(); + return NULL; + } + + fb->stream.stream_nextbuf = (void *)&fb_nextbuf; + fb->stream.stream_close = &fb_del; + + fb->encoding = encoding; + Py_INCREF(obj); + fb->file = obj; + + fb->read = PyObject_GetAttrString(obj, "read"); + if (fb->read == NULL) { + goto fail; + } + fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE); + if (fb->chunksize == NULL) { + goto fail; + } + + return (stream *)fb; + +fail: + fb_del((stream *)fb); + return NULL; +} + + +/* + * Stream from a Python iterable by interpreting each item as a line in a file + */ +typedef struct { + stream stream; + /* The Python file object being read. 
*/ + PyObject *iterator; + + /* Python str object holding the line most recently fetched */ + PyObject *line; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_lines_from_iterator; + + +static int +it_del(stream *strm) +{ + python_lines_from_iterator *it = (python_lines_from_iterator *)strm; + + Py_XDECREF(it->iterator); + Py_XDECREF(it->line); + + PyMem_FREE(strm); + return 0; +} + + +static int +it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind) +{ + Py_XDECREF(it->line); + it->line = NULL; + + PyObject *line = PyIter_Next(it->iterator); + if (line == NULL) { + if (PyErr_Occurred()) { + return -1; + } + *start = NULL; + *end = NULL; + return BUFFER_IS_FILEEND; + } + it->line = process_stringlike(line, it->encoding); + if (it->line == NULL) { + return -1; + } + + buffer_info_from_unicode(it->line, start, end, kind); + return BUFFER_IS_LINEND; +} + + +NPY_NO_EXPORT stream * +stream_python_iterable(PyObject *obj, const char *encoding) +{ + python_lines_from_iterator *it; + + if (!PyIter_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "error reading from object, expected an iterable."); + return NULL; + } + + it = (python_lines_from_iterator *)PyMem_Calloc(1, sizeof(*it)); + if (it == NULL) { + PyErr_NoMemory(); + return NULL; + } + + it->stream.stream_nextbuf = (void *)&it_nextbuf; + it->stream.stream_close = &it_del; + + it->encoding = encoding; + Py_INCREF(obj); + it->iterator = obj; + + return (stream *)it; +} diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h new file mode 100644 index 000000000..45c11dd95 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h @@ -0,0 +1,16 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "textreading/stream.h" + +NPY_NO_EXPORT stream * +stream_python_file(PyObject *obj, const char *encoding); + +NPY_NO_EXPORT stream * +stream_python_iterable(PyObject *obj, const char *encoding); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src new file mode 100644 index 000000000..6ddba3345 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -0,0 +1,457 @@ + +#include <Python.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/parser_config.h" +#include "textreading/growth.h" + + +/* + How parsing quoted fields works: + + For quoting to be activated, the first character of the field + must be the quote character (after taking into account + ignore_leading_spaces). While quoting is active, delimiters + are treated as regular characters, not delimiters. Quoting is + deactivated by the second occurrence of the quote character. An + exception is the occurrence of two consecutive quote characters, + which is treated as a literal occurrence of a single quote character. + E.g. (with delimiter=',' and quote='"'): + 12.3,"New York, NY","3'2""" + The second and third fields are `New York, NY` and `3'2"`. 
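    At the Python level the same line parses roughly as follows (a hedged
    sketch rather than a test; it assumes the `quotechar` keyword that the
    Python-side `loadtxt` wrapper exposes):

        np.loadtxt(['12.3,"New York, NY","3\'2"""'],
                   delimiter=',', quotechar='"', dtype=str)
        # -> fields '12.3', 'New York, NY' and '3\'2"'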
+ + If a non-delimiter occurs after the closing quote, the quote is + ignored and parsing continues with quoting deactivated. Quotes + that occur while quoting is not activated are not handled specially; + they become part of the data. + E.g: + 12.3,"ABC"DEF,XY"Z + The second and third fields are `ABCDEF` and `XY"Z`. + + Note that the second field of + 12.3,"ABC" ,4.5 + is `ABC `. Currently there is no option to ignore whitespace + at the end of a field. +*/ + + +/**begin repeat + * #type = Py_UCS1, Py_UCS2, Py_UCS4# + */ +static NPY_INLINE int +copy_to_field_buffer_@type@(tokenizer_state *ts, + const @type@ *chunk_start, const @type@ *chunk_end) +{ + npy_intp chunk_length = chunk_end - chunk_start; + npy_intp size = chunk_length + ts->field_buffer_pos + 2; + + if (NPY_UNLIKELY(ts->field_buffer_length < size)) { + npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4)); + if (alloc_size < 0) { + PyErr_Format(PyExc_ValueError, + "line too long to handle while reading file."); + return -1; + } + Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size); + if (grown == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->field_buffer_length = size; + ts->field_buffer = grown; + } + + Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos; + for (; chunk_start < chunk_end; chunk_start++, write_pos++) { + *write_pos = (Py_UCS4)*chunk_start; + } + *write_pos = '\0'; /* always ensure we end with NUL */ + ts->field_buffer_pos += chunk_length; + return 0; +} +/**end repeat**/ + + +static NPY_INLINE int +add_field(tokenizer_state *ts) +{ + /* The previous field is done, advance to keep a NUL byte at the end */ + ts->field_buffer_pos += 1; + + if (NPY_UNLIKELY(ts->num_fields + 1 > ts->fields_size)) { + npy_intp size = ts->num_fields; + + npy_intp alloc_size = grow_size_and_multiply( + &size, 4, sizeof(field_info)); + if (alloc_size < 0) { + /* Check for a size overflow, path should be almost impossible. */ + PyErr_Format(PyExc_ValueError, + "too many columns found; cannot read file."); + return -1; + } + field_info *fields = PyMem_Realloc(ts->fields, alloc_size); + if (fields == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->fields = fields; + ts->fields_size = size; + } + + ts->fields[ts->num_fields].offset = ts->field_buffer_pos; + ts->fields[ts->num_fields].quoted = false; + ts->num_fields += 1; + /* Ensure this (currently empty) word is NUL terminated. 
*/ + ts->field_buffer[ts->field_buffer_pos] = '\0'; + return 0; +} + + +/**begin repeat + * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND# + * #type = Py_UCS1, Py_UCS2, Py_UCS4# + */ +static NPY_INLINE int +tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) +{ + @type@ *pos = (@type@ *)ts->pos; + @type@ *stop = (@type@ *)ts->end; + @type@ *chunk_start; + + if (ts->state == TOKENIZE_CHECK_QUOTED) { + /* before we can check for quotes, strip leading whitespace */ + if (config->ignore_leading_whitespace) { + while (pos < stop && Py_UNICODE_ISSPACE(*pos) && + *pos != '\r' && *pos != '\n') { + pos++; + } + if (pos == stop) { + ts->pos = (char *)pos; + return 0; + } + } + + /* Setting chunk effectively starts the field */ + if (*pos == config->quote) { + ts->fields[ts->num_fields - 1].quoted = true; + ts->state = TOKENIZE_QUOTED; + pos++; /* TOKENIZE_QUOTED is OK with pos == stop */ + } + else { + /* Set to TOKENIZE_QUOTED or TOKENIZE_QUOTED_WHITESPACE */ + ts->state = ts->unquoted_state; + } + } + + switch (ts->state) { + case TOKENIZE_UNQUOTED: + chunk_start = pos; + for (; pos < stop; pos++) { + if (*pos == '\r') { + ts->state = TOKENIZE_EAT_CRLF; + break; + } + else if (*pos == '\n') { + ts->state = TOKENIZE_LINE_END; + break; + } + else if (*pos == config->delimiter) { + ts->state = TOKENIZE_INIT; + break; + } + else if (*pos == config->comment) { + ts->state = TOKENIZE_GOTO_LINE_END; + break; + } + } + if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) { + return -1; + } + pos++; + break; + + case TOKENIZE_UNQUOTED_WHITESPACE: + /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */ + chunk_start = pos; + for (; pos < stop; pos++) { + if (*pos == '\r') { + ts->state = TOKENIZE_EAT_CRLF; + break; + } + else if (*pos == '\n') { + ts->state = TOKENIZE_LINE_END; + break; + } + else if (Py_UNICODE_ISSPACE(*pos)) { + ts->state = TOKENIZE_INIT; + break; + } + else if (*pos == config->comment) { + ts->state = TOKENIZE_GOTO_LINE_END; + break; + } + } + if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) { + return -1; + } + pos++; + break; + + case TOKENIZE_QUOTED: + chunk_start = pos; + for (; pos < stop; pos++) { + if (*pos == config->quote) { + ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE; + break; + } + } + if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) { + return -1; + } + pos++; + break; + + case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE: + if (*pos == config->quote) { + /* Copy the quote character directly from the config: */ + if (copy_to_field_buffer_Py_UCS4(ts, + &config->quote, &config->quote+1) < 0) { + return -1; + } + ts->state = TOKENIZE_QUOTED; + pos++; + } + else { + /* continue parsing as if unquoted */ + ts->state = TOKENIZE_UNQUOTED; + } + break; + + case TOKENIZE_GOTO_LINE_END: + if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) { + pos = stop; /* advance to next buffer */ + ts->state = TOKENIZE_LINE_END; + break; + } + for (; pos < stop; pos++) { + if (*pos == '\r') { + ts->state = TOKENIZE_EAT_CRLF; + break; + } + else if (*pos == '\n') { + ts->state = TOKENIZE_LINE_END; + break; + } + } + pos++; + break; + + case TOKENIZE_EAT_CRLF: + /* "Universal newline" support: remove \n in \r\n. */ + if (*pos == '\n') { + pos++; + } + ts->state = TOKENIZE_LINE_END; + break; + + default: + assert(0); + } + + ts->pos = (char *)pos; + return 0; +} +/**end repeat**/ + + +/* + * This tokenizer always copies the full "row" (all tokens). This makes + * two things easier: + * 1. 
It means that every word is guaranteed to be followed by a NUL character + * (although it can include one as well). + * 2. If usecols are used we can sniff the first row easier by parsing it + * fully. Further, usecols can be negative so we may not know which row we + * need up-front. + * + * The tokenizer could grow the ability to skip fields and check the + * maximum number of fields when known, it is unclear that this is worthwhile. + * + * Unlike some tokenizers, this one tries to work in chunks and copies + * data in chunks as well. The hope is that this makes multiple light-weight + * loops rather than a single heavy one, to allow e.g. quickly scanning for the + * end of a field. Copying chunks also means we usually only check once per + * field whether the buffer is large enough. + * Different choices are possible, this one seems to work well, though. + * + * The core (main part) of the tokenizer is specialized for the three Python + * unicode flavors UCS1, UCS2, and UCS4 as a worthwhile optimization. + */ +NPY_NO_EXPORT int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config) +{ + assert(ts->fields_size >= 2); + assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4)); + + int finished_reading_file = 0; + + /* Reset to start of buffer */ + ts->field_buffer_pos = 0; + ts->num_fields = 0; + + while (1) { + /* + * This loop adds new fields to the result (to make up a full row) + * until the row ends (typically a line end or the file end) + */ + if (ts->state == TOKENIZE_INIT) { + /* Start a new field */ + if (add_field(ts) < 0) { + return -1; + } + ts->state = TOKENIZE_CHECK_QUOTED; + } + + if (NPY_UNLIKELY(ts->pos >= ts->end)) { + if (ts->buf_state == BUFFER_IS_LINEND && + ts->state != TOKENIZE_QUOTED) { + /* + * Finished line, do not read anymore (also do not eat \n). + * If we are in a quoted field and the "line" does not end with + * a newline, the quoted field will not have it either. + * I.e. `np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"')` + * reads "ab". This matches `next(csv.reader(['"a', 'b"']))`. + */ + break; + } + /* fetch new data */ + ts->buf_state = stream_nextbuf(s, + &ts->pos, &ts->end, &ts->unicode_kind); + if (ts->buf_state < 0) { + return -1; + } + if (ts->buf_state == BUFFER_IS_FILEEND) { + finished_reading_file = 1; + ts->pos = ts->end; /* stream should ensure this. */ + break; + } + else if (ts->pos == ts->end) { + /* This must be an empty line (and it must be indicated!). */ + assert(ts->buf_state == BUFFER_IS_LINEND); + break; + } + } + int status; + if (ts->unicode_kind == PyUnicode_1BYTE_KIND) { + status = tokenizer_core_Py_UCS1(ts, config); + } + else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) { + status = tokenizer_core_Py_UCS2(ts, config); + } + else { + assert(ts->unicode_kind == PyUnicode_4BYTE_KIND); + status = tokenizer_core_Py_UCS4(ts, config); + } + if (status < 0) { + return -1; + } + + if (ts->state == TOKENIZE_LINE_END) { + break; + } + } + + /* + * We have finished tokenizing a full row into fields, finalize result + */ + if (ts->buf_state == BUFFER_IS_LINEND) { + /* This line is "finished", make sure we don't touch it again: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + if (NPY_UNLIKELY(ts->pos < ts->end)) { + PyErr_SetString(PyExc_ValueError, + "Found an unquoted embedded newline within a single line of " + "input. 
This is currently not supported."); + return -1; + } + } + + /* Finish the last field (we "append" one to store the last ones length) */ + if (add_field(ts) < 0) { + return -1; + } + ts->num_fields -= 1; + + /* + * If have one field, but that field is completely empty, this is an + * empty line, and we just ignore it. + */ + if (ts->num_fields == 1 + && ts->fields[1].offset - ts->fields[0].offset == 1 + && !ts->fields->quoted) { + ts->num_fields--; + } + ts->state = TOKENIZE_INIT; + return finished_reading_file; +} + + +NPY_NO_EXPORT void +tokenizer_clear(tokenizer_state *ts) +{ + PyMem_FREE(ts->field_buffer); + ts->field_buffer = NULL; + ts->field_buffer_length = 0; + + PyMem_FREE(ts->fields); + ts->fields = NULL; + ts->fields_size = 0; +} + + +/* + * Initialize the tokenizer. We may want to copy all important config + * variables into the tokenizer. This would improve the cache locality during + * tokenizing. + */ +NPY_NO_EXPORT int +tokenizer_init(tokenizer_state *ts, parser_config *config) +{ + /* State and buf_state could be moved into tokenize if we go by row */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + ts->state = TOKENIZE_INIT; + if (config->delimiter_is_whitespace) { + ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE; + } + else { + ts->unquoted_state = TOKENIZE_UNQUOTED; + } + ts->num_fields = 0; + + ts->buf_state = 0; + ts->pos = NULL; + ts->end = NULL; + + ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4)); + if (ts->field_buffer == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->field_buffer_length = 32; + + ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields)); + if (ts->fields == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->fields_size = 4; + return 0; +} diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h new file mode 100644 index 000000000..fa10bb9b0 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -0,0 +1,78 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ + +#include <Python.h> +#include "numpy/ndarraytypes.h" + +#include "textreading/stream.h" +#include "textreading/parser_config.h" + + +typedef enum { + /* Initialization of fields */ + TOKENIZE_INIT, + TOKENIZE_CHECK_QUOTED, + /* Main field parsing states */ + TOKENIZE_UNQUOTED, + TOKENIZE_UNQUOTED_WHITESPACE, + TOKENIZE_QUOTED, + /* Handling of two character control sequences (except "\r\n") */ + TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE, + /* Line end handling */ + TOKENIZE_LINE_END, + TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */ + TOKENIZE_GOTO_LINE_END, +} tokenizer_parsing_state; + + +typedef struct { + size_t offset; + bool quoted; +} field_info; + + +typedef struct { + tokenizer_parsing_state state; + /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */ + tokenizer_parsing_state unquoted_state; + int unicode_kind; + int buf_state; + /* the buffer we are currently working on */ + char *pos; + char *end; + /* + * Space to copy words into. The buffer must always be at least two NUL + * entries longer (8 bytes) than the actual word (including initially). + * The first byte beyond the current word is always NUL'ed on write, the + * second byte is there to allow easy appending of an additional empty + * word at the end (this word is also NUL terminated). + */ + npy_intp field_buffer_length; + npy_intp field_buffer_pos; + Py_UCS4 *field_buffer; + + /* + * Fields, including information about the field being quoted. 
This + * always includes one "additional" empty field. The length of a field + * is equal to `fields[i+1].offset - fields[i].offset - 1`. + * + * The tokenizer assumes at least one field is allocated. + */ + npy_intp num_fields; + npy_intp fields_size; + field_info *fields; +} tokenizer_state; + + +NPY_NO_EXPORT void +tokenizer_clear(tokenizer_state *ts); + + +NPY_NO_EXPORT int +tokenizer_init(tokenizer_state *ts, parser_config *config); + +NPY_NO_EXPORT int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */ diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src index 933f75808..b4b060720 100644 --- a/numpy/core/src/npysort/quicksort.c.src +++ b/numpy/core/src/npysort/quicksort.c.src @@ -51,8 +51,14 @@ #include "npy_sort.h" #include "npysort_common.h" +#include "npy_cpu_features.h" +#include "x86-qsort.h" #include <stdlib.h> +#ifndef NPY_DISABLE_OPTIMIZATION + #include "x86-qsort.dispatch.h" +#endif + #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -83,11 +89,22 @@ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta# + * #AVX512 = 0*5, 1, 1, 0*5, 1, 0*7# */ NPY_NO_EXPORT int quicksort_@suff@(void *start, npy_intp num, void *NOT_USED) { + +#if @AVX512@ + void (*dispfunc)(void*, npy_intp) = NULL; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = &x86_quicksort_@suff@); + if (dispfunc) { + (*dispfunc)(start, num); + return 0; + } +#endif + @type@ vp; @type@ *pl = start; @type@ *pr = pl + num - 1; diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.c.src b/numpy/core/src/npysort/x86-qsort.dispatch.c.src new file mode 100644 index 000000000..b93c737cb --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort.dispatch.c.src @@ -0,0 +1,587 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_SKX +#include <immintrin.h> +#include "numpy/npy_math.h" +#include "npy_sort.h" +#include "simd/simd.h" + + +/* + * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are + * based on these two research papers: + * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types + * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ + * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel Skylake + * https://arxiv.org/pdf/1704.08579.pdf + * + * High level idea: Vectorize the quicksort partitioning using AVX-512 + * compressstore instructions. The algorithm to pick the pivot is to use median of + * 72 elements picked at random. If the array size is < 128, then use + * Bitonic sorting network. Good resource for bitonic sorting network: + * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 + * + * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 for + * potential problems when converting this code to universal intrinsics framework. + */ + +/* + * Constants used in sorting 16 elements in a ZMM registers. 
Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +#define NETWORK1 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 +#define NETWORK2 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 +#define NETWORK3 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 +#define NETWORK4 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2 +#define NETWORK5 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +#define NETWORK6 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4 +#define NETWORK7 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8 +#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) +#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) +#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) +#define SHUFFLE_MASK(a,b,c,d) (a << 6) | (b << 4) | (c << 2) | d +#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) +#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +/* + * Vectorized random number generator xoroshiro128+. Broken into 2 parts: + * (1) vnext generates 2 64-bit random integers + * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to + * the length of the array + */ +#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ + _mm256_or_si256(_mm256_slli_epi64((x),(k)),_mm256_srli_epi64((x),64-(k))) + +static NPY_INLINE +__m256i vnext(__m256i* s0, __m256i* s1) { + *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ + *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), + _mm256_slli_epi64(*s1, 16)); + *s1 = VROTL(*s1, 37); + return _mm256_add_epi64(*s0, *s1); /* return random vector */ +} + +/* transform random numbers to the range between 0 and bound - 1 */ +static NPY_INLINE +__m256i rnd_epu32(__m256i rnd_vec, __m256i bound) { + __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); + __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); + return _mm256_blend_epi32(odd, even, 0b01010101); +} + +/**begin repeat + * + * #TYPE = INT, UINT, FLOAT# + * #type = int, uint, float# + * #type_t = npy_int, npy_uint, npy_float# + * #zmm_t = __m512i, __m512i, __m512# + * #ymm_t = __m256i, __m256i, __m256# + * #vsuf1 = epi32, epu32, ps# + * #vsuf2 = epi32, epi32, ps# + * #vsuf3 = si512, si512, ps# + * #vsuf4 = s32, u32, f32# + * #CMP_GE_OP = _MM_CMPINT_NLT, _MM_CMPINT_NLT, _CMP_GE_OQ# + * #TYPE_MAX_VAL = NPY_MAX_INT32, NPY_MAX_UINT32, NPY_INFINITYF# + * #TYPE_MIN_VAL = NPY_MIN_INT32, 0, -NPY_INFINITYF# + */ + +/* + * COEX == Compare and Exchange two registers by swapping min and max values + */ +#define COEX_ZMM_@vsuf1@(a, b) { \ + @zmm_t@ temp = a; \ + a = _mm512_min_@vsuf1@(a,b); \ + b = _mm512_max_@vsuf1@(temp, b);} \ + +#define COEX_YMM_@vsuf1@(a, b){ \ + @ymm_t@ temp = a; \ + a = _mm256_min_@vsuf1@(a, b); \ + b = _mm256_max_@vsuf1@(temp, b);} \ + +static NPY_INLINE +@zmm_t@ cmp_merge_@vsuf1@(@zmm_t@ in1, @zmm_t@ in2, __mmask16 mask) +{ + @zmm_t@ min = _mm512_min_@vsuf1@(in2, in1); + @zmm_t@ max = _mm512_max_@vsuf1@(in2, in1); + return _mm512_mask_mov_@vsuf2@(min, mask, max); // 0 -> min, 1 -> max +} + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +static NPY_INLINE +@zmm_t@ sort_zmm_@vsuf1@(@zmm_t@ zmm) +{ + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(0,1,2,3)), 0xCCCC); + zmm = 
cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK3),zmm), 0xF0F0); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5),zmm), 0xFF00); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +static NPY_INLINE +@zmm_t@ bitonic_merge_zmm_@vsuf1@(@zmm_t@ zmm) +{ + // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK7),zmm), 0xFF00); + // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0); + // 3) half_cleaner[4] + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + // 3) half_cleaner[1] + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +static NPY_INLINE +void bitonic_merge_two_zmm_@vsuf1@(@zmm_t@* zmm1, @zmm_t@* zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + *zmm2 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), *zmm2); + @zmm_t@ zmm3 = _mm512_min_@vsuf1@(*zmm1, *zmm2); + @zmm_t@ zmm4 = _mm512_max_@vsuf1@(*zmm1, *zmm2); + // 2) Recursive half cleaner for each + *zmm1 = bitonic_merge_zmm_@vsuf1@(zmm3); + *zmm2 = bitonic_merge_zmm_@vsuf1@(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive half cleaner +static NPY_INLINE +void bitonic_merge_four_zmm_@vsuf1@(@zmm_t@* zmm) +{ + @zmm_t@ zmm2r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[2]); + @zmm_t@ zmm3r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[3]); + @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm3r); + @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm2r); + @zmm_t@ zmm_t3 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm2r)); + @zmm_t@ zmm_t4 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm3r)); + @zmm_t@ zmm0 = _mm512_min_@vsuf1@(zmm_t1, zmm_t2); + @zmm_t@ zmm1 = _mm512_max_@vsuf1@(zmm_t1, zmm_t2); + @zmm_t@ zmm2 = _mm512_min_@vsuf1@(zmm_t3, zmm_t4); + @zmm_t@ zmm3 = _mm512_max_@vsuf1@(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm0); + zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm1); + zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm2); + zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm3); +} + +static NPY_INLINE +void bitonic_merge_eight_zmm_@vsuf1@(@zmm_t@* zmm) +{ + @zmm_t@ zmm4r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[4]); + @zmm_t@ zmm5r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[5]); + @zmm_t@ zmm6r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[6]); + @zmm_t@ zmm7r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[7]); + @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm7r); + @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm6r); + @zmm_t@ zmm_t3 = 
_mm512_min_@vsuf1@(zmm[2], zmm5r); + @zmm_t@ zmm_t4 = _mm512_min_@vsuf1@(zmm[3], zmm4r); + @zmm_t@ zmm_t5 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[3], zmm4r)); + @zmm_t@ zmm_t6 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[2], zmm5r)); + @zmm_t@ zmm_t7 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm6r)); + @zmm_t@ zmm_t8 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm7r)); + COEX_ZMM_@vsuf1@(zmm_t1, zmm_t3); + COEX_ZMM_@vsuf1@(zmm_t2, zmm_t4); + COEX_ZMM_@vsuf1@(zmm_t5, zmm_t7); + COEX_ZMM_@vsuf1@(zmm_t6, zmm_t8); + COEX_ZMM_@vsuf1@(zmm_t1, zmm_t2); + COEX_ZMM_@vsuf1@(zmm_t3, zmm_t4); + COEX_ZMM_@vsuf1@(zmm_t5, zmm_t6); + COEX_ZMM_@vsuf1@(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm_t1); + zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm_t2); + zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm_t3); + zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm_t4); + zmm[4] = bitonic_merge_zmm_@vsuf1@(zmm_t5); + zmm[5] = bitonic_merge_zmm_@vsuf1@(zmm_t6); + zmm[6] = bitonic_merge_zmm_@vsuf1@(zmm_t7); + zmm[7] = bitonic_merge_zmm_@vsuf1@(zmm_t8); +} + +static NPY_INLINE +void sort_16_@vsuf1@(@type_t@* arr, npy_int N) +{ + __mmask16 load_mask = (0x0001 << N) - 0x0001; + @zmm_t@ zmm = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr); + _mm512_mask_storeu_@vsuf2@(arr, load_mask, sort_zmm_@vsuf1@(zmm)); +} + +static NPY_INLINE +void sort_32_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 16) { + sort_16_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm1 = _mm512_loadu_@vsuf3@(arr); + __mmask16 load_mask = (0x0001 << (N-16)) - 0x0001; + @zmm_t@ zmm2 = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr + 16); + zmm1 = sort_zmm_@vsuf1@(zmm1); + zmm2 = sort_zmm_@vsuf1@(zmm2); + bitonic_merge_two_zmm_@vsuf1@(&zmm1, &zmm2); + _mm512_storeu_@vsuf3@(arr, zmm1); + _mm512_mask_storeu_@vsuf2@(arr + 16, load_mask, zmm2); +} + +static NPY_INLINE +void sort_64_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 32) { + sort_32_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm[4]; + zmm[0] = _mm512_loadu_@vsuf3@(arr); + zmm[1] = _mm512_loadu_@vsuf3@(arr + 16); + __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + if (N < 48) { + load_mask1 = (0x0001 << (N-32)) - 0x0001; + load_mask2 = 0x0000; + } + else if (N < 64) { + load_mask2 = (0x0001 << (N-48)) - 0x0001; + } + zmm[2] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 32); + zmm[3] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 48); + zmm[0] = sort_zmm_@vsuf1@(zmm[0]); + zmm[1] = sort_zmm_@vsuf1@(zmm[1]); + zmm[2] = sort_zmm_@vsuf1@(zmm[2]); + zmm[3] = sort_zmm_@vsuf1@(zmm[3]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]); + bitonic_merge_four_zmm_@vsuf1@(zmm); + _mm512_storeu_@vsuf3@(arr, zmm[0]); + _mm512_storeu_@vsuf3@(arr + 16, zmm[1]); + _mm512_mask_storeu_@vsuf2@(arr + 32, load_mask1, zmm[2]); + _mm512_mask_storeu_@vsuf2@(arr + 48, load_mask2, zmm[3]); +} + +static NPY_INLINE +void sort_128_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 64) { + sort_64_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm[8]; + zmm[0] = _mm512_loadu_@vsuf3@(arr); + zmm[1] = _mm512_loadu_@vsuf3@(arr + 16); + zmm[2] = _mm512_loadu_@vsuf3@(arr + 32); + zmm[3] = _mm512_loadu_@vsuf3@(arr + 48); + zmm[0] = sort_zmm_@vsuf1@(zmm[0]); + zmm[1] = sort_zmm_@vsuf1@(zmm[1]); + zmm[2] = sort_zmm_@vsuf1@(zmm[2]); + zmm[3] = sort_zmm_@vsuf1@(zmm[3]); + __mmask16 load_mask1 = 0xFFFF, 
load_mask2 = 0xFFFF; + __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; + if (N < 80) { + load_mask1 = (0x0001 << (N-64)) - 0x0001; + load_mask2 = 0x0000; + load_mask3 = 0x0000; + load_mask4 = 0x0000; + } + else if (N < 96) { + load_mask2 = (0x0001 << (N-80)) - 0x0001; + load_mask3 = 0x0000; + load_mask4 = 0x0000; + } + else if (N < 112) { + load_mask3 = (0x0001 << (N-96)) - 0x0001; + load_mask4 = 0x0000; + } + else { + load_mask4 = (0x0001 << (N-112)) - 0x0001; + } + zmm[4] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 64); + zmm[5] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 80); + zmm[6] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask3, arr + 96); + zmm[7] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask4, arr + 112); + zmm[4] = sort_zmm_@vsuf1@(zmm[4]); + zmm[5] = sort_zmm_@vsuf1@(zmm[5]); + zmm[6] = sort_zmm_@vsuf1@(zmm[6]); + zmm[7] = sort_zmm_@vsuf1@(zmm[7]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[4], &zmm[5]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[6], &zmm[7]); + bitonic_merge_four_zmm_@vsuf1@(zmm); + bitonic_merge_four_zmm_@vsuf1@(zmm + 4); + bitonic_merge_eight_zmm_@vsuf1@(zmm); + _mm512_storeu_@vsuf3@(arr, zmm[0]); + _mm512_storeu_@vsuf3@(arr + 16, zmm[1]); + _mm512_storeu_@vsuf3@(arr + 32, zmm[2]); + _mm512_storeu_@vsuf3@(arr + 48, zmm[3]); + _mm512_mask_storeu_@vsuf2@(arr + 64, load_mask1, zmm[4]); + _mm512_mask_storeu_@vsuf2@(arr + 80, load_mask2, zmm[5]); + _mm512_mask_storeu_@vsuf2@(arr + 96, load_mask3, zmm[6]); + _mm512_mask_storeu_@vsuf2@(arr + 112, load_mask4, zmm[7]); +} + + +static NPY_INLINE +void swap_@TYPE@(@type_t@ *arr, npy_intp ii, npy_intp jj) { + @type_t@ temp = arr[ii]; + arr[ii] = arr[jj]; + arr[jj] = temp; +} + +// Median of 3 stratergy +//static NPY_INLINE +//npy_intp get_pivot_index(@type_t@ *arr, const npy_intp left, const npy_intp right) { +// return (rand() % (right + 1 - left)) + left; +// //npy_intp middle = ((right-left)/2) + left; +// //@type_t@ a = arr[left], b = arr[middle], c = arr[right]; +// //if ((b >= a && b <= c) || (b <= a && b >= c)) +// // return middle; +// //if ((a >= b && a <= c) || (a <= b && a >= c)) +// // return left; +// //else +// // return right; +//} + +/* + * Picking the pivot: Median of 72 array elements chosen at random. + */ + +static NPY_INLINE +@type_t@ get_pivot_@vsuf1@(@type_t@ *arr, const npy_intp left, const npy_intp right) { + /* seeds for vectorized random number generator */ + __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, + 1324281658759788278, 6214952190349879213); + __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, + 7874578921548791257, 1998265912745817298); + s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); + s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); + + npy_intp arrsize = right - left + 1; + __m256i bound = _mm256_set1_epi32(arrsize > INT32_MAX ? 
INT32_MAX : arrsize); + __m512i left_vec = _mm512_set1_epi64(left); + __m512i right_vec = _mm512_set1_epi64(right); + @ymm_t@ v[9]; + /* fill 9 vectors with random numbers */ + for (npy_int i = 0; i < 9; ++i) { + __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */ + __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(rand_64, bound)); /* random numbers between 0 and bound - 1 */ + __m512i indices; + if (i < 5) + indices = _mm512_add_epi64(left_vec, rand_32); /* indices for arr */ + else + indices = _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */ + + v[i] = _mm512_i64gather_@vsuf2@(indices, arr, sizeof(@type_t@)); + } + + /* median network for 9 elements */ + COEX_YMM_@vsuf1@(v[0], v[1]); COEX_YMM_@vsuf1@(v[2], v[3]); + COEX_YMM_@vsuf1@(v[4], v[5]); COEX_YMM_@vsuf1@(v[6], v[7]); + COEX_YMM_@vsuf1@(v[0], v[2]); COEX_YMM_@vsuf1@(v[1], v[3]); + COEX_YMM_@vsuf1@(v[4], v[6]); COEX_YMM_@vsuf1@(v[5], v[7]); + COEX_YMM_@vsuf1@(v[0], v[4]); COEX_YMM_@vsuf1@(v[1], v[2]); + COEX_YMM_@vsuf1@(v[5], v[6]); COEX_YMM_@vsuf1@(v[3], v[7]); + COEX_YMM_@vsuf1@(v[1], v[5]); COEX_YMM_@vsuf1@(v[2], v[6]); + COEX_YMM_@vsuf1@(v[3], v[5]); COEX_YMM_@vsuf1@(v[2], v[4]); + COEX_YMM_@vsuf1@(v[3], v[4]); + COEX_YMM_@vsuf1@(v[3], v[8]); + COEX_YMM_@vsuf1@(v[4], v[8]); + + // technically v[4] needs to be sorted before we pick the correct median; + // picking the 4th element works just as well for performance + @type_t@* temp = (@type_t@*) &v[4]; + + return temp[4]; +} + +/* + * Partitions one ZMM register based on the pivot and returns the number of + * elements stored on the greater-than-pivot side. + */ +static NPY_INLINE +npy_int partition_vec_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right, + const @zmm_t@ curr_vec, const @zmm_t@ pivot_vec, + @zmm_t@* smallest_vec, @zmm_t@* biggest_vec) +{ + /* which elements are larger than the pivot */ + __mmask16 gt_mask = _mm512_cmp_@vsuf1@_mask(curr_vec, pivot_vec, @CMP_GE_OP@); + npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask); + _mm512_mask_compressstoreu_@vsuf2@(arr + left, _knot_mask16(gt_mask), curr_vec); + _mm512_mask_compressstoreu_@vsuf2@(arr + right - amount_gt_pivot, gt_mask, curr_vec); + *smallest_vec = _mm512_min_@vsuf1@(curr_vec, *smallest_vec); + *biggest_vec = _mm512_max_@vsuf1@(curr_vec, *biggest_vec); + return amount_gt_pivot; +} + +/* + * Partitions an array based on the pivot and returns the start index of + * the upper (greater-than-pivot) partition.
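 *
 * A rough scalar sketch of what this partition step computes (an
 * illustration only, not the implementation; it ignores the
 * smallest/biggest bookkeeping and reuses the swap_@TYPE@ helper
 * defined above):
 *
 *     npy_intp store = left;
 *     for (npy_intp i = left; i < right; ++i) {
 *         if (arr[i] <= pivot) {
 *             swap_@TYPE@(arr, store, i);
 *             store++;
 *         }
 *     }
 *     return store;   // start index of the upper partition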
+ */ +static NPY_INLINE +npy_intp partition_avx512_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right, + @type_t@ pivot, @type_t@* smallest, @type_t@* biggest) +{ + /* make array length divisible by 16, shortening the array */ + for (npy_int i = (right - left) % 16; i > 0; --i) { + *smallest = MIN(*smallest, arr[left]); + *biggest = MAX(*biggest, arr[left]); + if (arr[left] > pivot) { + swap_@TYPE@(arr, left, --right); + } + else { + ++left; + } + } + + if (left == right) + return left; /* less than 16 elements in the array */ + + @zmm_t@ pivot_vec = _mm512_set1_@vsuf2@(pivot); + @zmm_t@ min_vec = _mm512_set1_@vsuf2@(*smallest); + @zmm_t@ max_vec = _mm512_set1_@vsuf2@(*biggest); + + if (right - left == 16) { + @zmm_t@ vec = _mm512_loadu_@vsuf3@(arr + left); + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); + *smallest = npyv_reducemin_@vsuf4@(min_vec); + *biggest = npyv_reducemax_@vsuf4@(max_vec); + return left + (16 - amount_gt_pivot); + } + + // first and last 16 values are partitioned at the end + @zmm_t@ vec_left = _mm512_loadu_@vsuf3@(arr + left); + @zmm_t@ vec_right = _mm512_loadu_@vsuf3@(arr + (right - 16)); + // store positions of the vectors + npy_intp r_store = right - 16; + npy_intp l_store = left; + // indices for loading the elements + left += 16; + right -= 16; + while (right - left != 0) { + @zmm_t@ curr_vec; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((r_store + 16) - right < left - l_store) { + right -= 16; + curr_vec = _mm512_loadu_@vsuf3@(arr + right); + } + else { + curr_vec = _mm512_loadu_@vsuf3@(arr + left); + left += 16; + } + // partition the current vector and save it on both sides of the array + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, curr_vec, pivot_vec, &min_vec, &max_vec); + r_store -= amount_gt_pivot; l_store += (16 - amount_gt_pivot); + } + + /* partition and save vec_left and vec_right */ + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, vec_left, pivot_vec, &min_vec, &max_vec); + l_store += (16 - amount_gt_pivot); + amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, l_store + 16, vec_right, pivot_vec, &min_vec, &max_vec); + l_store += (16 - amount_gt_pivot); + *smallest = npyv_reducemin_@vsuf4@(min_vec); + *biggest = npyv_reducemax_@vsuf4@(max_vec); + return l_store; +} + +static NPY_INLINE +void qsort_@type@(@type_t@* arr, npy_intp left, npy_intp right, npy_int max_iters) +{ + /* + * Resort to heapsort if quicksort isn't making any progress + */ + if (max_iters <= 0) { + heapsort_@type@((void*)(arr + left), right + 1 - left, NULL); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_@vsuf1@(arr + left, right + 1 - left); + return; + } + + @type_t@ pivot = get_pivot_@vsuf1@(arr, left, right); + @type_t@ smallest = @TYPE_MAX_VAL@; + @type_t@ biggest = @TYPE_MIN_VAL@; + npy_intp pivot_index = partition_avx512_@vsuf1@(arr, left, right+1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_@type@(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_@type@(arr, pivot_index, right, max_iters - 1); +} +/**end repeat**/ + +static NPY_INLINE +npy_intp replace_nan_with_inf(npy_float* arr, npy_intp arrsize) +{ + npy_intp nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { + loadmask
= (0x0001 << arrsize) - 0x0001; + } + __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); + __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((npy_int) nanmask); + _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + +static NPY_INLINE +void replace_inf_with_nan(npy_float* arr, npy_intp arrsize, npy_intp nan_count) +{ + for (npy_intp ii = arrsize-1; nan_count > 0; --ii) { + arr[ii] = NPY_NANF; + nan_count -= 1; + } +} + +/**begin repeat + * + * #type = int, uint, float# + * #type_t = npy_int, npy_uint, npy_float# + * #FIXNAN = 0, 0, 1# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_@type@) +(void* arr, npy_intp arrsize) +{ + if (arrsize > 1) { +#if @FIXNAN@ + npy_intp nan_count = replace_nan_with_inf((@type_t@*) arr, arrsize); +#endif + qsort_@type@((@type_t@*) arr, 0, arrsize-1, 2*log2(arrsize)); +#if @FIXNAN@ + replace_inf_with_nan((@type_t@*) arr, arrsize, nan_count); +#endif + } +} +/**end repeat**/ + +#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h new file mode 100644 index 000000000..8cb8e3654 --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort.h @@ -0,0 +1,18 @@ +#include "numpy/npy_common.h" +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "x86-qsort.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, + (void *start, npy_intp num)) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 708e82910..73bb5e2d8 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9278,3 +9278,52 @@ class TestViewDtype: [[1284, 1798], [4368, 4882]], [[2312, 2826], [5396, 5910]]] assert_array_equal(x.view('<i2'), expected) + + +# Test various array sizes that hit different code paths in quicksort-avx512 +@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191, + 256, 383, 512, 1023, 2047]) +def test_sort_float(N): + # Regular data with nan sprinkled + np.random.seed(42) + arr = -0.5 + np.random.sample(N).astype('f') + arr[np.random.choice(arr.shape[0], 3)] = np.nan + assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) + + # (2) with +INF + infarr = np.inf*np.ones(N, dtype='f') + infarr[np.random.choice(infarr.shape[0], 5)] = -1.0 + assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) + + # (3) with -INF + neginfarr = -np.inf*np.ones(N, dtype='f') + neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0 + assert_equal(np.sort(neginfarr, kind='quick'), + np.sort(neginfarr, kind='heap')) + + # (4) with +/-INF + infarr = np.inf*np.ones(N, dtype='f') + infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf + assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) + + +def test_sort_int(): + # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled + rng = np.random.default_rng(42) + N = 2047 + minv = np.iinfo(np.int32).min + maxv = np.iinfo(np.int32).max + arr = rng.integers(low=minv, high=maxv, size=N).astype('int32') + arr[np.random.choice(arr.shape[0], 10)] = minv + arr[np.random.choice(arr.shape[0], 10)] = maxv + 
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) + + +def test_sort_uint(): + # Random data with NPY_MAX_UINT32 sprinkled + rng = np.random.default_rng(42) + N = 2047 + maxv = np.iinfo(np.uint32).max + arr = rng.integers(low=0, high=maxv, size=N).astype('uint32') + arr[np.random.choice(arr.shape[0], 10)] = maxv + assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index d5b130b72..900538134 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4411,7 +4411,7 @@ def _check_interpolation_as_method(method, interpolation, fname): f"the `interpolation=` argument to {fname} was renamed to " "`method=`, which has additional options.\n" "Users of the modes 'nearest', 'lower', 'higher', or " - "'midpoint' are encouraged to review the method they. " + "'midpoint' are encouraged to review the method they used. " "(Deprecated NumPy 1.22)", DeprecationWarning, stacklevel=4) if method != "linear": diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6818ef81d..90424aab4 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -5,6 +5,7 @@ import itertools import warnings import weakref import contextlib +import operator from operator import itemgetter, index as opindex, methodcaller from collections.abc import Mapping @@ -13,6 +14,7 @@ from . import format from ._datasource import DataSource from numpy.core import overrides from numpy.core.multiarray import packbits, unpackbits +from numpy.core._multiarray_umath import _load_from_filelike from numpy.core.overrides import set_array_function_like_doc, set_module from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, @@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): zipf.close() -def _floatconv(x): - try: - return float(x) # The fastest path. - except ValueError: - if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10. - try: - return float.fromhex(x) - except ValueError: - pass - raise # Raise the original exception, which makes more sense. - - -_CONVERTERS = [ # These converters only ever get strs (not bytes) as input. - (np.bool_, lambda x: bool(int(x))), - (np.uint64, np.uint64), - (np.int64, np.int64), - (np.integer, lambda x: int(float(x))), - (np.longdouble, np.longdouble), - (np.floating, _floatconv), - (complex, lambda x: complex(x.replace('+-', '-'))), - (np.bytes_, methodcaller('encode', 'latin-1')), - (np.unicode_, str), -] - - -def _getconv(dtype): - """ - Find the correct dtype converter. Adapted from matplotlib. - - Even when a lambda is returned, it is defined at the toplevel, to allow - testing for equality and enabling optimization for single-type data. - """ - for base, conv in _CONVERTERS: - if issubclass(dtype.type, base): - return conv - return str - - -# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers -# lifted to the toplevel because recursive inner functions cause either -# GC-dependent reference loops (because they are closures over loadtxt's -# internal variables) or large overheads if using a manual trampoline to hide -# the recursive calls. - - -# not to be confused with the flatten_dtype we import... -def _loadtxt_flatten_dtype_internal(dt): - """Unpack a structured data-type, and produce a packer function.""" - if dt.names is None: - # If the dtype is flattened, return. - # If the dtype has a shape, the dtype occurs - # in the list more than once. 
- shape = dt.shape - if len(shape) == 0: - return ([dt.base], None) - else: - packing = [(shape[-1], list)] - if len(shape) > 1: - for dim in dt.shape[-2::-1]: - packing = [(dim*packing[0][0], packing*dim)] - return ([dt.base] * int(np.prod(dt.shape)), - functools.partial(_loadtxt_pack_items, packing)) - else: - types = [] - packing = [] - for field in dt.names: - tp, bytes = dt.fields[field] - flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp) - types.extend(flat_dt) - flat_packing = flat_packer.args[0] if flat_packer else None - # Avoid extra nesting for subarrays - if tp.ndim > 0: - packing.extend(flat_packing) - else: - packing.append((len(flat_dt), flat_packing)) - return (types, functools.partial(_loadtxt_pack_items, packing)) - - -def _loadtxt_pack_items(packing, items): - """Pack items into nested lists based on re-packing info.""" - if packing is None: - return items[0] - elif packing is tuple: - return tuple(items) - elif packing is list: - return list(items) - else: - start = 0 - ret = [] - for length, subpacking in packing: - ret.append( - _loadtxt_pack_items(subpacking, items[start:start+length])) - start += length - return tuple(ret) - def _ensure_ndmin_ndarray_check_param(ndmin): """Just checks if the param ndmin is supported on _ensure_ndmin_ndarray. It is intended to be used as @@ -853,17 +760,330 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int): _loadtxt_chunksize = 50000 -def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None, - converters=None, skiprows=None, usecols=None, unpack=None, - ndmin=None, encoding=None, max_rows=None, *, like=None): +def _loadtxt_dispatcher( + fname, dtype=None, comments=None, delimiter=None, + converters=None, skiprows=None, usecols=None, unpack=None, + ndmin=None, encoding=None, max_rows=None, *, like=None): return (like,) +def _check_nonneg_int(value, name="argument"): + try: + operator.index(value) + except TypeError: + raise TypeError(f"{name} must be an integer") from None + if value < 0: + raise ValueError(f"{name} must be nonnegative") + + +def _preprocess_comments(iterable, comments, encoding): + """ + Generator that consumes a line iterated iterable and strips out the + multiple (or multi-character) comments from lines. + This is a pre-processing step to achieve feature parity with loadtxt + (we assume that this feature is a nieche feature). + """ + for line in iterable: + if isinstance(line, bytes): + # Need to handle conversion here, or the splitting would fail + line = line.decode(encoding) + + for c in comments: + line = line.split(c, 1)[0] + + yield line + + +# The number of rows we read in one go if confronted with a parametric dtype +_loadtxt_chunksize = 50000 + + +def _read(fname, *, delimiter=',', comment='#', quote='"', + imaginary_unit='j', usecols=None, skiplines=0, + max_rows=None, converters=None, ndmin=None, unpack=False, + dtype=np.float64, encoding="bytes"): + r""" + Read a NumPy array from a text file. + + Parameters + ---------- + fname : str or file object + The filename or the file to be read. + delimiter : str, optional + Field delimiter of the fields in line of the file. + Default is a comma, ','. If None any sequence of whitespace is + considered a delimiter. + comment : str or sequence of str or None, optional + Character that begins a comment. All text from the comment + character to the end of the line is ignored. + Multiple comments or multiple-character comment strings are supported, + but may be slower and `quote` must be empty if used. + Use None to disable all use of comments. 
+ quote : str or None, optional + Character that is used to quote string fields. Default is '"' + (a double quote). Use None to disable quote support. + imaginary_unit : str, optional + Character that represent the imaginay unit `sqrt(-1)`. + Default is 'j'. + usecols : array_like, optional + A one-dimensional array of integer column numbers. These are the + columns from the file to be included in the array. If this value + is not given, all the columns are used. + skiplines : int, optional + Number of lines to skip before interpreting the data in the file. + max_rows : int, optional + Maximum number of rows of data to read. Default is to read the + entire file. + converters : dict or callable, optional + A function to parse all columns strings into the desired value, or + a dictionary mapping column number to a parser function. + E.g. if column 0 is a date string: ``converters = {0: datestr2num}``. + Converters can also be used to provide a default value for missing + data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will + convert empty fields to 0. + Default: None + ndmin : int, optional + Minimum dimension of the array returned. + Allowed values are 0, 1 or 2. Default is 0. + unpack : bool, optional + If True, the returned array is transposed, so that arguments may be + unpacked using ``x, y, z = read(...)``. When used with a structured + data-type, arrays are returned for each field. Default is False. + dtype : numpy data type + A NumPy dtype instance, can be a structured dtype to map to the + columns of the file. + encoding : str, optional + Encoding used to decode the inputfile. The special value 'bytes' + (the default) enables backwards-compatible behavior for `converters`, + ensuring that inputs to the converter functions are encoded + bytes objects. The special value 'bytes' has no additional effect if + ``converters=None``. If encoding is ``'bytes'`` or ``None``, the + default system encoding is used. + + Returns + ------- + ndarray + NumPy array. + + Examples + -------- + First we create a file for the example. + + >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n' + >>> with open('example1.csv', 'w') as f: + ... f.write(s1) + >>> a1 = read_from_filename('example1.csv') + >>> a1 + array([[1., 2., 3.], + [4., 5., 6.]]) + + The second example has columns with different data types, so a + one-dimensional array with a structured data type is returned. + The tab character is used as the field delimiter. + + >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n' + >>> with open('example2.tsv', 'w') as f: + ... f.write(s2) + >>> a2 = read_from_filename('example2.tsv', delimiter='\t') + >>> a2 + array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')], + dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')]) + """ + # Handle special 'bytes' keyword for encoding + byte_converters = False + if encoding == 'bytes': + encoding = None + byte_converters = True + + if dtype is None: + raise TypeError("a dtype must be provided.") + dtype = np.dtype(dtype) + + read_dtype_via_object_chunks = None + if dtype.kind in 'SUM' and ( + dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'): + # This is a legacy "flexible" dtype. We do not truly support + # parametric dtypes currently (no dtype discovery step in the core), + # but have to support these for backward compatibility. 
+ read_dtype_via_object_chunks = dtype + dtype = np.dtype(object) + + if usecols is not None: + # Allow usecols to be a single int or a sequence of ints, the C-code + # handles the rest + try: + usecols = list(usecols) + except TypeError: + usecols = [usecols] + + _ensure_ndmin_ndarray_check_param(ndmin) + + if comment is None: + comments = None + else: + # assume comments are a sequence of strings + if "" in comment: + raise ValueError( + "comments cannot be an empty string. Use comments=None to " + "disable comments." + ) + comments = tuple(comment) + comment = None + if len(comments) == 0: + comments = None # No comments at all + elif len(comments) == 1: + # If there is only one comment, and that comment has one character, + # the normal parsing can deal with it just fine. + if isinstance(comments[0], str) and len(comments[0]) == 1: + comment = comments[0] + comments = None + else: + # Input validation if there are multiple comment characters + if delimiter in comments: + raise TypeError( + f"Comment characters '{comments}' cannot include the " + f"delimiter '{delimiter}'" + ) + + # comment is now either a 1 or 0 character string or a tuple: + if comments is not None: + # Note: An earlier version support two character comments (and could + # have been extended to multiple characters, we assume this is + # rare enough to not optimize for. + if quote is not None: + raise ValueError( + "when multiple comments or a multi-character comment is " + "given, quotes are not supported. In this case quotechar " + "must be set to None.") + + if len(imaginary_unit) != 1: + raise ValueError('len(imaginary_unit) must be 1.') + + _check_nonneg_int(skiplines) + if max_rows is not None: + _check_nonneg_int(max_rows) + else: + # Passing -1 to the C code means "read the entire file". + max_rows = -1 + + fh_closing_ctx = contextlib.nullcontext() + filelike = False + try: + if isinstance(fname, os.PathLike): + fname = os.fspath(fname) + if isinstance(fname, str): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + if encoding is None: + encoding = getattr(fh, 'encoding', 'latin1') + + fh_closing_ctx = contextlib.closing(fh) + data = fh + filelike = True + else: + if encoding is None: + encoding = getattr(fname, 'encoding', 'latin1') + data = iter(fname) + except TypeError as e: + raise ValueError( + f"fname must be a string, filehandle, list of strings,\n" + f"or generator. Got {type(fname)} instead.") from e + + with fh_closing_ctx: + if comments is not None: + if filelike: + data = iter(data) + filelike = False + data = _preprocess_comments(data, comments, encoding) + + if read_dtype_via_object_chunks is None: + arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters) + + else: + # This branch reads the file into chunks of object arrays and then + # casts them to the desired actual dtype. This ensures correct + # string-length and datetime-unit discovery (like `arr.astype()`). + # Due to chunking, certain error reports are less clear, currently. 
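[Editorial aside, not part of the patch] The comments in this hunk describe the fallback for "flexible" dtypes (a size-less ``S``/``U`` or unit-less ``M8``/``m8``): rows are first parsed into object arrays in fixed-size chunks, and each chunk is then cast so that string lengths and datetime units are discovered the way ``arr.astype()`` would discover them. A minimal pure-Python sketch of that idea follows; the helper name, the tiny chunk size, and the sample data are invented for illustration, and the real parsing in the patch is done by ``_load_from_filelike``, not shown here.

    import numpy as np

    def _cast_in_chunks(rows, final_dtype, chunk_size=2):
        # Parse everything as object first, then let astype() discover the
        # concrete string length (or datetime unit) one chunk at a time.
        chunks = [
            np.array(rows[i:i + chunk_size], dtype=object).astype(final_dtype)
            for i in range(0, len(rows), chunk_size)
        ]
        # concatenate promotes to the widest dtype discovered in any chunk.
        return np.concatenate(chunks)

    # The longest string determines the final itemsize, e.g. <U12 here;
    # the same idea applies to a unit-less np.dtype("M8") and date strings.
    print(_cast_in_chunks(["spam", "eggs-and-ham", "x"], np.dtype("U")).dtype)

[End of aside; the patch continues below.]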
+ if filelike: + data = iter(data) # cannot chunk when reading from file + + c_byte_converters = False + if read_dtype_via_object_chunks == "S": + c_byte_converters = True # Use latin1 rather than ascii + + chunks = [] + while max_rows != 0: + if max_rows < 0: + chunk_size = _loadtxt_chunksize + else: + chunk_size = min(_loadtxt_chunksize, max_rows) + + next_arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters, + c_byte_converters=c_byte_converters) + # Cast here already. We hope that this is better even for + # large files because the storage is more compact. It could + # be adapted (in principle the concatenate could cast). + chunks.append(next_arr.astype(read_dtype_via_object_chunks)) + + skiprows = 0 # Only have to skip for first chunk + if max_rows >= 0: + max_rows -= chunk_size + if len(next_arr) < chunk_size: + # There was less data than requested, so we are done. + break + + # Need at least one chunk, but if empty, the last one may have + # the wrong shape. + if len(chunks) > 1 and len(chunks[-1]) == 0: + del chunks[-1] + if len(chunks) == 1: + arr = chunks[0] + else: + arr = np.concatenate(chunks, axis=0) + + # NOTE: ndmin works as advertised for structured dtypes, but normally + # these would return a 1D result plus the structured dimension, + # so ndmin=2 adds a third dimension even when no squeezing occurs. + # A `squeeze=False` could be a better solution (pandas uses squeeze). + arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) + + if arr.shape: + if arr.shape[0] == 0: + warnings.warn( + f'loadtxt: input contained no data: "{fname}"', + category=UserWarning, + stacklevel=3 + ) + + if unpack: + # Unpack structured dtypes if requested: + dt = arr.dtype + if dt.names is not None: + # For structured arrays, return an array for each field. + return [arr[field] for field in dt.names] + else: + return arr.T + else: + return arr + + @set_array_function_like_doc @set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0, encoding='bytes', max_rows=None, *, like=None): + ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None, + like=None): r""" Load data from a text file. @@ -882,19 +1102,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, each row will be interpreted as an element of the array. In this case, the number of columns used must match the number of fields in the data-type. - comments : str or sequence of str, optional + comments : str or sequence of str or None, optional The characters or list of characters used to indicate the start of a comment. None implies no comments. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is '#'. delimiter : str, optional The string used to separate values. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is whitespace. - converters : dict, optional - A dictionary mapping column number to a function that will parse the - column string into the desired value. E.g., if column 0 is a date - string: ``converters = {0: datestr2num}``. Converters can also be - used to provide a default value for missing data (but see also - `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``. 
+ converters : dict or callable, optional + A function to parse all columns strings into the desired value, or + a dictionary mapping column number to a parser function. + E.g. if column 0 is a date string: ``converters = {0: datestr2num}``. + Converters can also be used to provide a default value for missing + data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will + convert empty fields to 0. Default: None. skiprows : int, optional Skip the first `skiprows` lines, including comments; default: 0. @@ -932,6 +1153,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, is to read all the lines. .. versionadded:: 1.16.0 + quotechar : unicode character or None, optional + The character used to denote the start and end of a quoted item. + Occurrences of the delimiter or comment characters are ignored within + a quoted item. The default value is ``quotechar=None``, which means + quoting support is disabled. + + If two consecutive instances of `quotechar` are found within a quoted + field, the first is treated as an escape character. See examples. + + .. versionadded:: 1.23.0 ${ARRAY_FUNCTION_LIKE} .. versionadded:: 1.20.0 @@ -979,6 +1210,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> y array([2., 4.]) + The `converters` argument is used to specify functions to preprocess the + text prior to parsing. `converters` can be a dictionary that maps + preprocessing functions to each column: + + >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n") + >>> conv = { + ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0 + ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1 + ... } + >>> np.loadtxt(s, delimiter=",", converters=conv) + array([[1., 3.], + [3., 5.]]) + + `converters` can be a callable instead of a dictionary, in which case it + is applied to all columns: + + >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE") + >>> import functools + >>> conv = functools.partial(int, base=16) + >>> np.loadtxt(s, converters=conv) + array([[222., 173.], + [192., 222.]]) + This example shows how `converters` can be used to convert a field with a trailing minus sign into a negative number. @@ -986,242 +1240,90 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> def conv(fld): ... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld) ... - >>> np.loadtxt(s, converters={0: conv, 1: conv}) + >>> np.loadtxt(s, converters=conv) array([[ 10.01, -31.25], [ 19.22, 64.31], [-17.57, 63.94]]) - """ - - if like is not None: - return _loadtxt_with_like( - fname, dtype=dtype, comments=comments, delimiter=delimiter, - converters=converters, skiprows=skiprows, usecols=usecols, - unpack=unpack, ndmin=ndmin, encoding=encoding, - max_rows=max_rows, like=like - ) - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Nested functions used by loadtxt. - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Using a callable as the converter can be particularly useful for handling + values with different formatting, e.g. floats with underscores: - def split_line(line: str): - """Chop off comments, strip, and split at delimiter.""" - for comment in comments: # Much faster than using a single regex. 
- line = line.split(comment, 1)[0] - line = line.strip('\r\n') - return line.split(delimiter) if line else [] + >>> s = StringIO("1 2.7 100_000") + >>> np.loadtxt(s, converters=float) + array([1.e+00, 2.7e+00, 1.e+05]) - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Main body of loadtxt. - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - _ensure_ndmin_ndarray_check_param(ndmin) - - # Type conversions for Py3 convenience - if comments is not None: - if isinstance(comments, (str, bytes)): - comments = [comments] - comments = [_decode_line(x) for x in comments] - else: - comments = [] - - if delimiter is not None: - delimiter = _decode_line(delimiter) - - user_converters = converters - - byte_converters = False - if encoding == 'bytes': - encoding = None - byte_converters = True - - if usecols is not None: - # Copy usecols, allowing it to be a single int or a sequence of ints. - try: - usecols = list(usecols) - except TypeError: - usecols = [usecols] - for i, col_idx in enumerate(usecols): - try: - usecols[i] = opindex(col_idx) # Cast to builtin int now. - except TypeError as e: - e.args = ( - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type %s" % - type(col_idx), - ) - raise - if len(usecols) > 1: - usecols_getter = itemgetter(*usecols) - else: - # Get an iterable back, even if using a single column. - usecols_getter = lambda obj, c=usecols[0]: [obj[c]] - else: - usecols_getter = None + This idea can be extended to automatically handle values specified in + many different formats: - # Make sure we're dealing with a proper dtype - dtype = np.dtype(dtype) - defconv = _getconv(dtype) + >>> def conv(val): + ... try: + ... return float(val) + ... except ValueError: + ... return float.fromhex(val) + >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2") + >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None) + array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00]) - dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype) + Note that with the default ``encoding="bytes"``, the inputs to the + converter function are latin-1 encoded byte strings. To deactivate the + implicit encoding prior to conversion, use ``encoding=None`` - fh_closing_ctx = contextlib.nullcontext() - try: - if isinstance(fname, os_PathLike): - fname = os_fspath(fname) - if _is_string_like(fname): - fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) - fencoding = getattr(fh, 'encoding', 'latin1') - line_iter = iter(fh) - fh_closing_ctx = contextlib.closing(fh) - else: - line_iter = iter(fname) - fencoding = getattr(fname, 'encoding', 'latin1') - try: - first_line = next(line_iter) - except StopIteration: - pass # Nothing matters if line_iter is empty. - else: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - if isinstance(first_line, bytes): - # Using latin1 matches _decode_line's behavior. - decoder = methodcaller( - "decode", - encoding if encoding is not None else "latin1") - line_iter = map(decoder, line_iter) - except TypeError as e: - raise ValueError( - f"fname must be a string, filehandle, list of strings,\n" - f"or generator. Got {type(fname)} instead." 
- ) from e + >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') + >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x) + >>> np.loadtxt(s, converters=conv, encoding=None) + array([[ 10.01, -31.25], + [ 19.22, 64.31], + [-17.57, 63.94]]) - with fh_closing_ctx: + Support for quoted fields is enabled with the `quotechar` parameter. + Comment and delimiter characters are ignored when they appear within a + quoted item delineated by `quotechar`: - # input may be a python2 io stream - if encoding is not None: - fencoding = encoding - # we must assume local encoding - # TODO emit portability warning? - elif fencoding is None: - import locale - fencoding = locale.getpreferredencoding() - - # Skip the first `skiprows` lines - for i in range(skiprows): - next(line_iter) - - # Read until we find a line with some values, and use it to determine - # the need for decoding and estimate the number of columns. - for first_line in line_iter: - ncols = len(usecols or split_line(first_line)) - if ncols: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - break - else: # End of lines reached - ncols = len(usecols or []) - warnings.warn('loadtxt: Empty input file: "%s"' % fname, - stacklevel=2) - - line_iter = itertools.islice(line_iter, max_rows) - lineno_words_iter = filter( - itemgetter(1), # item[1] is words; filter skips empty lines. - enumerate(map(split_line, line_iter), 1 + skiprows)) - - # Now that we know ncols, create the default converters list, and - # set packing, if necessary. - if len(dtype_types) > 1: - # We're dealing with a structured array, each field of - # the dtype matches a column - converters = [_getconv(dt) for dt in dtype_types] - else: - # All fields have the same dtype; use specialized packers which are - # much faster than those using _loadtxt_pack_items. - converters = [defconv for i in range(ncols)] - if ncols == 1: - packer = itemgetter(0) - else: - def packer(row): return row + >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n') + >>> dtype = np.dtype([("label", "U12"), ("value", float)]) + >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"') + array([('alpha, #42', 10.), ('beta, #64', 2.)], + dtype=[('label', '<U12'), ('value', '<f8')]) - # By preference, use the converters specified by the user - for i, conv in (user_converters or {}).items(): - if usecols: - try: - i = usecols.index(i) - except ValueError: - # Unused converter specified - continue - if byte_converters: - # converters may use decode to workaround numpy's old - # behaviour, so encode the string again (converters are only - # called with strings) before passing to the user converter. - def tobytes_first(conv, x): - return conv(x.encode("latin1")) - converters[i] = functools.partial(tobytes_first, conv) - else: - converters[i] = conv - - fencode = methodcaller("encode", fencoding) - converters = [conv if conv is not bytes else fencode - for conv in converters] - if len(set(converters)) == 1: - # Optimize single-type data. Note that this is only reached if - # `_getconv` returns equal callables (i.e. not local lambdas) on - # equal dtypes. 
- def convert_row(vals, _conv=converters[0]): - return [*map(_conv, vals)] - else: - def convert_row(vals): - return [conv(val) for conv, val in zip(converters, vals)] - - # read data in chunks and fill it into an array via resize - # over-allocating and shrinking the array later may be faster but is - # probably not relevant compared to the cost of actually reading and - # converting the data - X = None - while True: - chunk = [] - for lineno, words in itertools.islice( - lineno_words_iter, _loadtxt_chunksize): - if usecols_getter is not None: - words = usecols_getter(words) - elif len(words) != ncols: - raise ValueError( - f"Wrong number of columns at line {lineno}") - # Convert each value according to its column, then pack it - # according to the dtype's nesting, and store it. - chunk.append(packer(convert_row(words))) - if not chunk: # The islice is empty, i.e. we're done. - break + Two consecutive quote characters within a quoted field are treated as a + single escaped character: - if X is None: - X = np.array(chunk, dtype) - else: - nshape = list(X.shape) - pos = nshape[0] - nshape[0] += len(chunk) - X.resize(nshape, refcheck=False) - X[pos:, ...] = chunk + >>> s = StringIO('"Hello, my name is ""Monty""!"') + >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"') + array('Hello, my name is "Monty"!', dtype='<U26') - if X is None: - X = np.array([], dtype) + """ - # Multicolumn data are returned with shape (1, N, M), i.e. - # (1, 1, M) for a single row - remove the singleton dimension there - if X.ndim == 3 and X.shape[:2] == (1, 1): - X.shape = (1, -1) + if like is not None: + return _loadtxt_with_like( + fname, dtype=dtype, comments=comments, delimiter=delimiter, + converters=converters, skiprows=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, like=like + ) - X = _ensure_ndmin_ndarray(X, ndmin=ndmin) + if isinstance(delimiter, bytes): + delimiter.decode("latin1") - if unpack: - if len(dtype_types) > 1: - # For structured arrays, return an array for each field. 
- return [X[field] for field in dtype.names] - else: - return X.T - else: - return X + if dtype is None: + dtype = np.float64 + + comment = comments + # Control character type conversions for Py3 convenience + if comment is not None: + if isinstance(comment, (str, bytes)): + comment = [comment] + comment = [ + x.decode('latin1') if isinstance(x, bytes) else x for x in comment] + if isinstance(delimiter, bytes): + delimiter = delimiter.decode('latin1') + + arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, + converters=converters, skiplines=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, quote=quotechar) + + return arr _loadtxt_with_like = array_function_dispatch( diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index b9b10bc06..a2758123b 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -695,7 +695,7 @@ class TestLoadTxt(LoadTxtBase): assert_array_equal(x, a) d = TextIO() - d.write('M 64.0 75.0\nF 25.0 60.0') + d.write('M 64 75.0\nF 25 60.0') d.seek(0) mydescriptor = {'names': ('gender', 'age', 'weight'), 'formats': ('S1', 'i4', 'f4')} @@ -779,6 +779,8 @@ class TestLoadTxt(LoadTxtBase): a = np.array([[1, 2, 3], [4, 5, 6]], int) assert_array_equal(x, a) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_comments_multi_chars(self): c = TextIO() c.write('/* comment\n1,2,3,5\n') @@ -871,16 +873,27 @@ class TestLoadTxt(LoadTxtBase): bogus_idx = 1.5 assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=bogus_idx ) assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=[0, bogus_idx, 0] ) + def test_bad_usecols(self): + with pytest.raises(OverflowError): + np.loadtxt(["1\n"], usecols=[2**64], delimiter=",") + with pytest.raises((ValueError, OverflowError)): + # Overflow error on 32bit platforms + np.loadtxt(["1\n"], usecols=[2**62], delimiter=",") + with pytest.raises(TypeError, + match="If a structured dtype .*. But 1 usecols were given and " + "the number of fields is 3."): + np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",") + def test_fancy_dtype(self): c = TextIO() c.write('1,2,3.0\n4,5,6.0\n') @@ -919,8 +932,7 @@ class TestLoadTxt(LoadTxtBase): assert_array_equal(x, a) def test_empty_file(self): - with suppress_warnings() as sup: - sup.filter(message="loadtxt: Empty input file:") + with pytest.warns(UserWarning, match="input contained no data"): c = TextIO() x = np.loadtxt(c) assert_equal(x.shape, (0,)) @@ -981,29 +993,32 @@ class TestLoadTxt(LoadTxtBase): c.write(inp) for dt in [float, np.float32]: c.seek(0) - res = np.loadtxt(c, dtype=dt) + res = np.loadtxt( + c, dtype=dt, converters=float.fromhex, encoding="latin1") assert_equal(res, tgt, err_msg="%s" % dt) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_no_default_hex_conversion(self): """ Ensure that fromhex is only used for values with the correct prefix and is not called by default. Regression test related to gh-19598. 
""" c = TextIO("a b c") - with pytest.raises( - ValueError, match="could not convert string to float" - ): + with pytest.raises(ValueError, + match=".*convert string 'a' to float64 at row 0, column 1"): np.loadtxt(c) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_exception(self): """ Ensure that the exception message raised during failed floating point conversion is correct. Regression test related to gh-19598. """ c = TextIO("qrs tuv") # Invalid values for default float converter - with pytest.raises( - ValueError, match="could not convert string to float" - ): + with pytest.raises(ValueError, + match="could not convert string 'qrs' to float64"): np.loadtxt(c) def test_from_complex(self): @@ -1099,8 +1114,7 @@ class TestLoadTxt(LoadTxtBase): assert_(x.shape == (3,)) # Test ndmin kw with empty file. - with suppress_warnings() as sup: - sup.filter(message="loadtxt: Empty input file:") + with pytest.warns(UserWarning, match="input contained no data"): f = TextIO() assert_(np.loadtxt(f, ndmin=2).shape == (0, 1,)) assert_(np.loadtxt(f, ndmin=1).shape == (0,)) @@ -1132,8 +1146,8 @@ class TestLoadTxt(LoadTxtBase): @pytest.mark.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968', reason="Wrong preferred encoding") def test_binary_load(self): - butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\ - b"20,2,3,\xc3\x95scar\n\r" + butf8 = b"5,6,7,\xc3\x95scarscar\r\n15,2,3,hello\r\n"\ + b"20,2,3,\xc3\x95scar\r\n" sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines() with temppath() as path: with open(path, "wb") as f: @@ -1196,6 +1210,30 @@ class TestLoadTxt(LoadTxtBase): a = np.array([[1, 2, 3, 5], [4, 5, 7, 8], [2, 1, 4, 5]], int) assert_array_equal(x, a) + @pytest.mark.parametrize(["skip", "data"], [ + (1, ["ignored\n", "1,2\n", "\n", "3,4\n"]), + # "Bad" lines that do not end in newlines: + (1, ["ignored", "1,2", "", "3,4"]), + (1, StringIO("ignored\n1,2\n\n3,4")), + # Same as above, but do not skip any lines: + (0, ["-1,0\n", "1,2\n", "\n", "3,4\n"]), + (0, ["-1,0", "1,2", "", "3,4"]), + (0, StringIO("-1,0\n1,2\n\n3,4"))]) + def test_max_rows_empty_lines(self, skip, data): + with pytest.warns(UserWarning, + match=f"Input line 3.*max_rows={3-skip}"): + res = np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) + assert_array_equal(res, [[-1, 0], [1, 2], [3, 4]][skip:]) + + if isinstance(data, StringIO): + data.seek(0) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with pytest.raises(UserWarning): + np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) class Testfromregex: def test_record(self): @@ -2397,6 +2435,13 @@ M 33 21.99 assert_equal(test['f1'], 17179869184) assert_equal(test['f2'], 1024) + def test_unpack_float_data(self): + txt = TextIO("1,2,3\n4,5,6\n7,8,9\n0.0,1.0,2.0") + a, b, c = np.loadtxt(txt, delimiter=",", unpack=True) + assert_array_equal(a, np.array([1.0, 4.0, 7.0, 0.0])) + assert_array_equal(b, np.array([2.0, 5.0, 8.0, 1.0])) + assert_array_equal(c, np.array([3.0, 6.0, 9.0, 2.0])) + def test_unpack_structured(self): # Regression test for gh-4341 # Unpacking should work on structured arrays diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py new file mode 100644 index 000000000..cca328b16 --- /dev/null +++ b/numpy/lib/tests/test_loadtxt.py @@ -0,0 +1,1002 @@ +""" +Tests specific to `np.loadtxt` added during the move of loadtxt to be backed +by C code. 
+These tests complement those found in `test_io.py`. +""" + +import sys +import pytest +from tempfile import NamedTemporaryFile, mkstemp +from io import StringIO + +import numpy as np +from numpy.ma.testutils import assert_equal +from numpy.testing import assert_array_equal, HAS_REFCOUNT, IS_PYPY + + +def test_scientific_notation(): + """Test that both 'e' and 'E' are parsed correctly.""" + data = StringIO( + ( + "1.0e-1,2.0E1,3.0\n" + "4.0e-2,5.0E-1,6.0\n" + "7.0e-3,8.0E1,9.0\n" + "0.0e-4,1.0E-1,2.0" + ) + ) + expected = np.array( + [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]] + ) + assert_array_equal(np.loadtxt(data, delimiter=","), expected) + + +@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"]) +def test_comment_multiple_chars(comment): + content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n" + txt = StringIO(content.replace("#", comment)) + a = np.loadtxt(txt, delimiter=",", comments=comment) + assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]]) + + +@pytest.fixture +def mixed_types_structured(): + """ + Fixture providing hetergeneous input data with a structured dtype, along + with the associated structured array. + """ + data = StringIO( + ( + "1000;2.4;alpha;-34\n" + "2000;3.1;beta;29\n" + "3500;9.9;gamma;120\n" + "4090;8.1;delta;0\n" + "5001;4.4;epsilon;-99\n" + "6543;7.8;omega;-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + return data, dtype, expected + + +@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) +def test_structured_dtype_and_skiprows_no_empty_lines( + skiprows, mixed_types_structured): + data, dtype, expected = mixed_types_structured + a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) + assert_array_equal(a, expected[skiprows:]) + + +def test_unpack_structured(mixed_types_structured): + data, dtype, expected = mixed_types_structured + + a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) + assert_array_equal(a, expected["f0"]) + assert_array_equal(b, expected["f1"]) + assert_array_equal(c, expected["f2"]) + assert_array_equal(d, expected["f3"]) + + +def test_structured_dtype_with_shape(): + dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) + data = StringIO("0,1,2,3\n6,7,8,9\n") + expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) + + +def test_structured_dtype_with_multi_shape(): + dtype = np.dtype([("a", "u1", (2, 2))]) + data = StringIO("0 1 2 3\n") + expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) + assert_array_equal(np.loadtxt(data, dtype=dtype), expected) + + +def test_nested_structured_subarray(): + # Test from gh-16678 + point = np.dtype([('x', float), ('y', float)]) + dt = np.dtype([('code', int), ('points', point, (2,))]) + data = StringIO("100,1,2,3,4\n200,5,6,7,8\n") + expected = np.array( + [ + (100, [(1., 2.), (3., 4.)]), + (200, [(5., 6.), (7., 8.)]), + ], + dtype=dt + ) + assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) + + +def test_structured_dtype_offsets(): + # An aligned structured dtype will have additional padding + dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) + data = StringIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") + expected = np.array([(1, 2, 
3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_negative_row_limits(param): + """skiprows and max_rows should raise for negative parameters.""" + with pytest.raises(ValueError, match="argument must be nonnegative"): + np.loadtxt("foo.bar", **{param: -3}) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_noninteger_row_limits(param): + with pytest.raises(TypeError, match="argument must be an integer"): + np.loadtxt("foo.bar", **{param: 1.0}) + + +@pytest.mark.parametrize( + "data, shape", + [ + ("1 2 3 4 5\n", (1, 5)), # Single row + ("1\n2\n3\n4\n5\n", (5, 1)), # Single column + ] +) +def test_ndmin_single_row_or_col(data, shape): + arr = np.array([1, 2, 3, 4, 5]) + arr2d = arr.reshape(shape) + + assert_array_equal(np.loadtxt(StringIO(data), dtype=int), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=0), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=1), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=2), arr2d) + + +@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) +def test_bad_ndmin(badval): + with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): + np.loadtxt("foo.bar", ndmin=badval) + + +@pytest.mark.parametrize( + "ws", + ( + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_blank_lines_spaces_delimit(ws): + txt = StringIO( + f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1" + ) + # NOTE: It is unclear that the ` # comment` should succeed. Except + # for delimiter=None, which should use any whitespace (and maybe + # should just be implemented closer to Python + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected + ) + + +def test_blank_lines_normal_delimiter(): + txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected + ) + + +@pytest.mark.parametrize("dtype", (float, object)) +def test_maxrows_no_blank_lines(dtype): + txt = StringIO("1.5,2.5\n3.0,4.0\n5.5,6.0") + res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) + assert_equal(res.dtype, dtype) + assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) +def test_exception_message_bad_values(dtype): + txt = StringIO("1,2\n3,XXX\n5,6") + msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + +def test_converters_negative_indices(): + txt = StringIO('1.5,2.5\n3.0,XXX\n5.5,6.0') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) + res = np.loadtxt( + txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None + ) + assert_equal(res, expected) + + +def test_converters_negative_indices_with_usecols(): + txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + 
expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) + res = np.loadtxt( + txt, + dtype=np.float64, + delimiter=",", + converters=conv, + usecols=[0, -1], + encoding=None, + ) + assert_equal(res, expected) + + # Second test with variable number of rows: + res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",", + usecols=[0, -1], converters={-1: (lambda x: -1)}) + assert_array_equal(res, [[0, -1], [0, -1]]) + +def test_ragged_usecols(): + # usecols, and negative ones, work even with varying number of columns. + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + expected = np.array([[0, 0], [0, 0], [0, 0]]) + res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + assert_equal(res, expected) + + txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n") + with pytest.raises(ValueError, + match="invalid column index -2 at row 1 with 2 columns"): + # There is no -2 column in the second row: + np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + + +def test_empty_usecols(): + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) + assert res.shape == (3,) + assert res.dtype == np.dtype([]) + + +@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) +@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) +def test_large_unicode_characters(c1, c2): + # c1 and c2 span ascii, 16bit and 32bit range. + txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") + res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") + expected = np.array( + [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], + dtype=np.dtype('U12') + ) + assert_equal(res, expected) + + +def test_unicode_with_converter(): + txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") + conv = {0: lambda s: s.upper()} + res = np.loadtxt( + txt, + dtype=np.dtype("U12"), + converters=conv, + delimiter=",", + encoding=None + ) + expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) + assert_equal(res, expected) + + +def test_converter_with_structured_dtype(): + txt = StringIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') + dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) + conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} + res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) + expected = np.array( + [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt + ) + assert_equal(res, expected) + + +def test_converter_with_unicode_dtype(): + """ + With the default 'bytes' encoding, tokens are encoded prior to being + passed to the converter. This means that the output of the converter may + be bytes instead of unicode as expected by `read_rows`. + + This test checks that outputs from the above scenario are properly decoded + prior to parsing by `read_rows`. 
+ """ + txt = StringIO('abc,def\nrst,xyz') + conv = bytes.upper + res = np.loadtxt( + txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) + assert_equal(res, expected) + + +def test_read_huge_row(): + row = "1.5, 2.5," * 50000 + row = row[:-1] + "\n" + txt = StringIO(row * 2) + res = np.loadtxt(txt, delimiter=",", dtype=float) + assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) + + +@pytest.mark.parametrize("dtype", "edfgFDG") +def test_huge_float(dtype): + # Covers a non-optimized path that is rarely taken: + field = "0" * 1000 + ".123456789" + dtype = np.dtype(dtype) + value = np.loadtxt([field], dtype=dtype)[()] + assert value == dtype.type("0.123456789") + + +@pytest.mark.parametrize( + ("given_dtype", "expected_dtype"), + [ + ("S", np.dtype("S5")), + ("U", np.dtype("U5")), + ], +) +def test_string_no_length_given(given_dtype, expected_dtype): + """ + The given dtype is just 'S' or 'U' with no length. In these cases, the + length of the resulting dtype is determined by the longest string found + in the file. + """ + txt = StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") + res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") + expected = np.array( + [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype + ) + assert_equal(res, expected) + assert_equal(res.dtype, expected_dtype) + + +def test_float_conversion(): + """ + Some tests that the conversion to float64 works as accurately as the + Python built-in `float` function. In a naive version of the float parser, + these strings resulted in values that were off by an ULP or two. + """ + strings = [ + '0.9999999999999999', + '9876543210.123456', + '5.43215432154321e+300', + '0.901', + '0.333', + ] + txt = StringIO('\n'.join(strings)) + res = np.loadtxt(txt) + expected = np.array([float(s) for s in strings]) + assert_equal(res, expected) + + +def test_bool(): + # Simple test for bool via integer + txt = StringIO("1, 0\n10, -1") + res = np.loadtxt(txt, dtype=bool, delimiter=",") + assert res.dtype == bool + assert_array_equal(res, [[True, False], [True, True]]) + # Make sure we use only 1 and 0 on the byte level: + assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_integer_signs(dtype): + dtype = np.dtype(dtype) + assert np.loadtxt(["+2"], dtype=dtype) == 2 + if dtype.kind == "u": + with pytest.raises(ValueError): + np.loadtxt(["-1\n"], dtype=dtype) + else: + assert np.loadtxt(["-2\n"], dtype=dtype) == -2 + + for sign in ["++", "+-", "--", "-+"]: + with pytest.raises(ValueError): + np.loadtxt([f"{sign}2\n"], dtype=dtype) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_implicit_cast_float_to_int_fails(dtype): + txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6") + with pytest.raises(ValueError): + np.loadtxt(txt, dtype=dtype, delimiter=",") + +@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) +@pytest.mark.parametrize("with_parens", (False, True)) +def test_complex_parsing(dtype, with_parens): + s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" + if not with_parens: + s = s.replace("(", "").replace(")", "") + + res = np.loadtxt(StringIO(s), dtype=dtype, delimiter=",") + expected = np.array( + [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 
0]], dtype=dtype + ) + assert_equal(res, expected) + + +def test_read_from_generator(): + def gen(): + for i in range(4): + yield f"{i},{2*i},{i**2}" + + res = np.loadtxt(gen(), dtype=int, delimiter=",") + expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) + assert_equal(res, expected) + + +def test_read_from_generator_multitype(): + def gen(): + for i in range(3): + yield f"{i} {i / 4}" + + res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") + expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") + assert_equal(res, expected) + + +def test_read_from_bad_generator(): + def gen(): + for entry in ["1,2", b"3, 5", 12738]: + yield entry + + with pytest.raises( + TypeError, match=r"non-string returned while reading data"): + np.loadtxt(gen(), dtype="i, i", delimiter=",") + + +@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") +def test_object_cleanup_on_read_error(): + sentinel = object() + already_read = 0 + + def conv(x): + nonlocal already_read + if already_read > 4999: + raise ValueError("failed half-way through!") + already_read += 1 + return sentinel + + txt = StringIO("x\n" * 10000) + + with pytest.raises(ValueError, match="at row 5000, column 1"): + np.loadtxt(txt, dtype=object, converters={0: conv}) + + assert sys.getrefcount(sentinel) == 2 + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_character_not_bytes_compatible(): + """Test exception when a character cannot be encoded as 'S'.""" + data = StringIO("–") # == \u2013 + with pytest.raises(ValueError): + np.loadtxt(data, dtype="S5") + + +@pytest.mark.parametrize("conv", (0, [float], "")) +def test_invalid_converter(conv): + msg = ( + "converters must be a dictionary mapping columns to converter " + "functions or a single callable." 
+ ) + with pytest.raises(TypeError, match=msg): + np.loadtxt(StringIO("1 2\n3 4"), converters=conv) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_converters_dict_raises_non_integer_key(): + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}) + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}, usecols=0) + + +@pytest.mark.parametrize("bad_col_ind", (3, -3)) +def test_converters_dict_raises_non_col_key(bad_col_ind): + data = StringIO("1 2\n3 4") + with pytest.raises(ValueError, match="converter specified for column"): + np.loadtxt(data, converters={bad_col_ind: int}) + + +def test_converters_dict_raises_val_not_callable(): + with pytest.raises(TypeError, + match="values of the converters dictionary must be callable"): + np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) + + +@pytest.mark.parametrize("q", ('"', "'", "`")) +def test_quoted_field(q): + txt = StringIO( + f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" + ) + dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) + expected = np.array( + [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype + ) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) + assert_array_equal(res, expected) + + +def test_quote_support_default(): + """Support for quoted fields is disabled by default.""" + txt = StringIO('"lat,long", 45, 30\n') + dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) + + with pytest.raises(ValueError, match="the number of columns changed"): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + # Enable quoting support with non-None value for quotechar param + txt.seek(0) + expected = np.array([("lat,long", 45., 30.)], dtype=dtype) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, expected) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_quotechar_multichar_error(): + txt = StringIO("1,2\n3,4") + msg = r".*must be a single unicode character or None" + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, delimiter=",", quotechar="''") + + +def test_comment_multichar_error_with_quote(): + txt = StringIO("1,2\n3,4") + msg = ( + "when multiple comments or a multi-character comment is given, " + "quotes are not supported." 
+ ) + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') + + # A single character string in a tuple is unpacked though: + res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") + assert_equal(res, [[1, 2], [3, 4]]) + + +def test_structured_dtype_with_quotes(): + data = StringIO( + ( + "1000;2.4;'alpha';-34\n" + "2000;3.1;'beta';29\n" + "3500;9.9;'gamma';120\n" + "4090;8.1;'delta';0\n" + "5001;4.4;'epsilon';-99\n" + "6543;7.8;'omega';-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") + assert_array_equal(res, expected) + + +def test_quoted_field_is_not_empty(): + txt = StringIO('1\n\n"4"\n""') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_quoted_field_is_not_empty_nonstrict(): + # Same as test_quoted_field_is_not_empty but check that we are not strict + # about missing closing quote (this is the `csv.reader` default also) + txt = StringIO('1\n\n"4"\n"') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_consecutive_quotechar_escaped(): + txt = StringIO('"Hello, my name is ""Monty""!"') + expected = np.array('Hello, my name is "Monty"!', dtype="U40") + res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') + assert_equal(res, expected) + + +@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) +@pytest.mark.parametrize("ndmin", (0, 1, 2)) +@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) +def test_warn_on_no_data(data, ndmin, usecols): + """Check that a UserWarning is emitted when no data is read from input.""" + if usecols is not None: + expected_shape = (0, 3) + elif ndmin == 2: + expected_shape = (0, 1) # guess a single column?! 
+ else: + expected_shape = (0,) + + txt = StringIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape + + with NamedTemporaryFile(mode="w") as fh: + fh.write(data) + fh.seek(0) + with pytest.warns(UserWarning, match="input contained no data"): + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape + +@pytest.mark.parametrize("skiprows", (2, 3)) +def test_warn_on_skipped_data(skiprows): + data = "1 2 3\n4 5 6" + txt = StringIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + np.loadtxt(txt, skiprows=skiprows) + + +@pytest.mark.parametrize(["dtype", "value"], [ + ("i2", 0x0001), ("u2", 0x0001), + ("i4", 0x00010203), ("u4", 0x00010203), + ("i8", 0x0001020304050607), ("u8", 0x0001020304050607), + # The following values are constructed to lead to unique bytes: + ("float16", 3.07e-05), + ("float32", 9.2557e-41), ("complex64", 9.2557e-41+2.8622554e-29j), + ("float64", -1.758571353180402e-24), + # Here and below, the repr side-steps a small loss of precision in + # complex `str` in PyPy (which is probably fine, as repr works): + ("complex128", repr(5.406409232372729e-29-1.758571353180402e-24j)), + # Use integer values that fit into double. Everything else leads to + # problems due to longdoubles going via double and decimal strings + # causing rounding errors. + ("longdouble", 0x01020304050607), + ("clongdouble", repr(0x01020304050607 + (0x00121314151617 * 1j))), + ("U2", "\U00010203\U000a0b0c")]) +@pytest.mark.parametrize("swap", [True, False]) +def test_byteswapping_and_unaligned(dtype, value, swap): + # Try to create "interesting" values within the valid unicode range: + dtype = np.dtype(dtype) + data = [f"x,{value}\n"] # repr as PyPy `str` truncates some + if swap: + dtype = dtype.newbyteorder() + full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) + # The above ensures that the interesting "b" field is unaligned: + assert full_dt.fields["b"][1] == 1 + res = np.loadtxt(data, dtype=full_dt, delimiter=",", encoding=None, + max_rows=1) # max-rows prevents over-allocation + assert res["b"] == dtype.type(value) + + +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efdFD" + "?") +def test_unicode_whitespace_stripping(dtype): + # Test that all numeric types (and bool) strip whitespace correctly + # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted. 
+ # Currently, skip float128 as it did not always support this and has no + # "custom" parsing: + txt = StringIO(' 3 ,"\u202F2\n"') + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, np.array([3, 2]).astype(dtype)) + + +@pytest.mark.parametrize("dtype", "FD") +def test_unicode_whitespace_stripping_complex(dtype): + # Complex has a few extra cases since it has two components and + # parentheses + line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" + data = [line, line.replace(" ", "\u202F")] + res = np.loadtxt(data, dtype=dtype, delimiter=',') + assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", "FD") +@pytest.mark.parametrize("field", + ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) +def test_bad_complex(dtype, field): + with pytest.raises(ValueError): + np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efgdFDG" + "?") +def test_nul_character_error(dtype): + # Test that a \0 character is correctly recognized as an error even if + # what comes before is valid (not everything gets parsed internally). + if dtype.lower() == "g": + pytest.xfail("longdouble/clongdouble assignment may misbehave.") + with pytest.raises(ValueError): + np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"') + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efgdFDG" + "?") +def test_no_thousands_support(dtype): + # Mainly to document behaviour, Python supports thousands like 1_1. + # (e and G may end up using different conversion and support it, this is + # a bug but happens...) + if dtype == "e": + pytest.skip("half assignment currently uses Python float converter") + if dtype in "eG": + pytest.xfail("clongdouble assignment is buggy (uses `complex`?).") + + assert int("1_1") == float("1_1") == complex("1_1") == 11 + with pytest.raises(ValueError): + np.loadtxt(["1_1\n"], dtype=dtype) + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2\n,3\n"], + ["1,2\n", "2\r,3\n"]]) +def test_bad_newline_in_iterator(data): + # In NumPy <=1.22 this was accepted, because newlines were completely + # ignored when the input was an iterable. This could be changed, but right + # now, we raise an error. + msg = "Found an unquoted embedded newline within a single line" + with pytest.raises(ValueError, match=msg): + np.loadtxt(data, delimiter=",") + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2,3\r\n"], # a universal newline + ["1,2\n", "'2\n',3\n"], # a quoted newline + ["1,2\n", "'2\r',3\n"], + ["1,2\n", "'2\r\n',3\n"], +]) +def test_good_newline_in_iterator(data): + # The quoted newlines will be untransformed here, but are just whitespace. + res = np.loadtxt(data, delimiter=",", quotechar="'") + assert_array_equal(res, [[1., 2.], [2., 3.]]) + + +@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) +def test_universal_newlines_quoted(newline): + # Check that universal newline support within the tokenizer is not applied + # to quoted fields. 
+    # fields will not include a newline at all)
+    data = ['1,"2\n"\n', '3,"4\n', '1"\n']
+    data = [row.replace("\n", newline) for row in data]
+    res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"')
+    assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']])
+
+
+def test_null_character():
+    # Basic tests to check that the NUL character is not special:
+    res = np.loadtxt(["1\0002\0003\n", "4\0005\0006"], delimiter="\000")
+    assert_array_equal(res, [[1, 2, 3], [4, 5, 6]])
+
+    # Also not as part of a field (avoid unicode/arrays as unicode strips \0)
+    res = np.loadtxt(["1\000,2\000,3\n", "4\000,5\000,6"],
+                     delimiter=",", dtype=object)
+    assert res.tolist() == [["1\000", "2\000", "3"], ["4\000", "5\000", "6"]]
+
+
+def test_iterator_fails_getting_next_line():
+    class BadSequence:
+        def __len__(self):
+            return 100
+
+        def __getitem__(self, item):
+            if item == 50:
+                raise RuntimeError("Bad things happened!")
+            return f"{item}, {item+1}"
+
+    with pytest.raises(RuntimeError, match="Bad things happened!"):
+        np.loadtxt(BadSequence(), dtype=int, delimiter=",")
+
+
+class TestCReaderUnitTests:
+    # These are internal tests for paths that should not be possible to hit
+    # unless things go very very wrong somewhere.
+    def test_not_an_filelike(self):
+        with pytest.raises(AttributeError, match=".*read"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype=np.dtype("i"), filelike=True)
+
+    def test_filelike_read_fails(self):
+        # Can only be reached if loadtxt opens the file, so it is hard to do
+        # via the public interface (although maybe not impossible considering
+        # the current "DataClass" backing).
+        class BadFileLike:
+            counter = 0
+
+            def read(self, size):
+                self.counter += 1
+                if self.counter > 20:
+                    raise RuntimeError("Bad bad bad!")
+                return "1,2,3\n"
+
+        with pytest.raises(RuntimeError, match="Bad bad bad!"):
+            np.core._multiarray_umath._load_from_filelike(
+                BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+    def test_filelike_bad_read(self):
+        # Can only be reached if loadtxt opens the file, so it is hard to do
+        # via the public interface (although maybe not impossible considering
+        # the current "DataClass" backing).
+
+        class BadFileLike:
+            counter = 0
+
+            def read(self, size):
+                return 1234  # not a string!
+
+        with pytest.raises(TypeError,
+                    match="non-string returned while reading data"):
+            np.core._multiarray_umath._load_from_filelike(
+                BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+    def test_not_an_iter(self):
+        with pytest.raises(TypeError,
+                    match="error reading from object, expected an iterable"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype=np.dtype("i"), filelike=False)
+
+    def test_bad_type(self):
+        with pytest.raises(TypeError, match="internal error: dtype must"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype="i", filelike=False)
+
+    def test_bad_encoding(self):
+        with pytest.raises(TypeError, match="encoding must be a unicode"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype=np.dtype("i"), filelike=False, encoding=123)
+
+    @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"])
+    def test_manual_universal_newlines(self, newline):
+        # This is currently not available to users, because we should always
+        # open files with universal newlines enabled `newline=None`.
+        # (And reading from an iterator uses slightly different code paths.)
+        # We have no real support for `newline="\r"` or `newline="\n"` as the
+        # user cannot specify those options.
+        data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline),
+                        newline="")
+
+        res = np.core._multiarray_umath._load_from_filelike(
+            data, dtype=np.dtype("U10"), filelike=True,
+            quote='"', comment="#", skiplines=1)
+        assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])
+
+
+def test_delimiter_comment_collision_raises():
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
+
+
+def test_delimiter_quotechar_collision_raises():
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",")
+
+
+def test_comment_quotechar_collision_raises():
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+        np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#")
+
+
+def test_delimiter_and_multiple_comments_collision_raises():
+    with pytest.raises(
+        TypeError, match="Comment characters.*cannot include the delimiter"
+    ):
+        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","])
+
+
+@pytest.mark.parametrize(
+    "ws",
+    (
+        " ",  # space
+        "\t",  # tab
+        "\u2003",  # em
+        "\u00A0",  # non-break
+        "\u3000",  # ideographic space
+    )
+)
+def test_collision_with_default_delimiter_raises(ws):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+        np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws)
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
+        np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws)
+
+
+@pytest.mark.parametrize("nl", ("\n", "\r"))
+def test_control_character_newline_raises(nl):
+    txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}")
+    msg = "control character.*cannot be a newline"
+    with pytest.raises(TypeError, match=msg):
+        np.loadtxt(txt, delimiter=nl)
+    with pytest.raises(TypeError, match=msg):
+        np.loadtxt(txt, comments=nl)
+    with pytest.raises(TypeError, match=msg):
+        np.loadtxt(txt, quotechar=nl)
+
+
+@pytest.mark.parametrize(
+    ("generic_data", "long_datum", "unitless_dtype", "expected_dtype"),
+    [
+        ("2012-03", "2013-01-15", "M8", "M8[D]"),  # Datetimes
+        ("spam-a-lot", "tis_but_a_scratch", "U", "U17"),  # str
+    ],
+)
+@pytest.mark.parametrize("nrows", (10, 50000, 60000))  # lt, eq, gt chunksize
+def test_parametric_unit_discovery(
+    generic_data, long_datum, unitless_dtype, expected_dtype, nrows
+):
+    """Check that the correct unit (e.g. month, day, second) is discovered from
+    the data when a user specifies a unitless datetime."""
+    # Unit should be "D" (days) due to last entry
+    data = [generic_data] * nrows + [long_datum]
+    expected = np.array(data, dtype=expected_dtype)
+
+    # file-like path
+    txt = StringIO("\n".join(data))
+    a = np.loadtxt(txt, dtype=unitless_dtype)
+    assert a.dtype == expected.dtype
+    assert_equal(a, expected)
+
+    # file-obj path
+    fd, fname = mkstemp()
+    with open(fname, "w") as fh:
+        fh.write("\n".join(data))
+    a = np.loadtxt(fname, dtype=unitless_dtype)
+    assert a.dtype == expected.dtype
+    assert_equal(a, expected)
+
+
+def test_str_dtype_unit_discovery_with_converter():
+    data = ["spam-a-lot"] * 60000 + ["XXXtis_but_a_scratch"]
+    expected = np.array(
+        ["spam-a-lot"] * 60000 + ["tis_but_a_scratch"], dtype="U17"
+    )
+    conv = lambda s: s.strip("XXX")
+
+    # file-like path
+    txt = StringIO("\n".join(data))
+    a = np.loadtxt(txt, dtype="U", converters=conv, encoding=None)
+    assert a.dtype == expected.dtype
+    assert_equal(a, expected)
+
+    # file-obj path
+    fd, fname = mkstemp()
+    with open(fname, "w") as fh:
+        fh.write("\n".join(data))
+    a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None)
+    assert a.dtype == expected.dtype
+    assert_equal(a, expected)
+
+
+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+                    reason="PyPy bug in error formatting")
+def test_control_character_empty():
+    with pytest.raises(TypeError, match="Text reading control character must"):
+        np.loadtxt(StringIO("1 2 3"), delimiter="")
+    with pytest.raises(TypeError, match="Text reading control character must"):
+        np.loadtxt(StringIO("1 2 3"), quotechar="")
+    with pytest.raises(ValueError, match="comments cannot be an empty string"):
+        np.loadtxt(StringIO("1 2 3"), comments="")
+    with pytest.raises(ValueError, match="comments cannot be an empty string"):
+        np.loadtxt(StringIO("1 2 3"), comments=["#", ""])
+
+
+def test_control_characters_as_bytes():
+    """Byte control characters (comments, delimiter) are supported."""
+    a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
+    assert_equal(a, [1, 2, 3])
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 2c71e45bd..80a6fdd10 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -810,7 +810,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
                 'Mismatched elements: {} / {} ({:.3g}%)'.format(
                     n_mismatch, n_elements, percent_mismatch)]
 
-        with errstate(invalid='ignore', divide='ignore'):
+        with errstate(all='ignore'):
             # ignore errors for non-numeric types
             with contextlib.suppress(TypeError):
                 error = abs(x - y)
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 31d2cdc76..919ca751f 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -207,6 +207,14 @@ class TestArrayEqual(_GenericTest):
         self._test_not_equal(a, b)
         self._test_not_equal(b, a)
 
+    def test_suppress_overflow_warnings(self):
+        # Based on issue #18992
+        with pytest.raises(AssertionError):
+            with np.errstate(all="raise"):
+                np.testing.assert_array_equal(
+                    np.array([1, 2, 3], np.float32),
+                    np.array([1, 1e-40, 3], np.float32))
+
 
 class TestBuildErrorMessage:
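
The new test_loadtxt cases above exercise the quoting and control-character validation of the rewritten np.loadtxt. As a minimal, standalone usage sketch (not part of the patch; it only assumes the behaviour the new tests assert on a NumPy build containing this C text reader), the interplay of delimiter, quotechar and comments looks roughly like this:

    import numpy as np
    from io import StringIO

    # A quoted field keeps its embedded newline and the quote characters are
    # stripped from the parsed value (cf. test_universal_newlines_quoted):
    res = np.loadtxt(['1,"2\n"\n', '3,"4\n', '1"\n'],
                     dtype=object, delimiter=",", quotechar='"')
    assert res.tolist() == [['1', '2\n'], ['3', '4\n1']]

    # Control characters must be distinct: reusing the delimiter as a comment
    # character is rejected (cf. test_delimiter_comment_collision_raises):
    try:
        np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
    except TypeError as exc:
        print(exc)  # message mentions that the control characters are incompatible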
