Diffstat (limited to 'numpy/core')
27 files changed, 3499 insertions, 0 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a13480907..9704cff0a 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -869,6 +869,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'typeinfo.h'), join('src', 'multiarray', 'usertypes.h'), join('src', 'multiarray', 'vdot.h'), + join('src', 'multiarray', 'textreading', 'readtext.h'), join('include', 'numpy', 'arrayobject.h'), join('include', 'numpy', '_neighborhood_iterator_imp.h'), join('include', 'numpy', 'npy_endian.h'), @@ -947,6 +948,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'usertypes.c'), join('src', 'multiarray', 'vdot.c'), join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'x86-qsort.dispatch.c.src'), join('src', 'npysort', 'quicksort.c.src'), join('src', 'npysort', 'mergesort.cpp'), join('src', 'npysort', 'timsort.cpp'), @@ -956,6 +958,14 @@ def configuration(parent_package='',top_path=None): join('src', 'npysort', 'selection.cpp'), join('src', 'common', 'npy_binsearch.h'), join('src', 'npysort', 'binsearch.cpp'), + join('src', 'multiarray', 'textreading', 'conversions.c'), + join('src', 'multiarray', 'textreading', 'field_types.c'), + join('src', 'multiarray', 'textreading', 'growth.c'), + join('src', 'multiarray', 'textreading', 'readtext.c'), + join('src', 'multiarray', 'textreading', 'rows.c'), + join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), + join('src', 'multiarray', 'textreading', 'str_to_int.c'), + join('src', 'multiarray', 'textreading', 'tokenize.c.src'), ] ####################################################################### diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index f8632e701..93e9d9d45 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -371,7 +371,79 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) #define npyv_sum_u64 _mm512_reduce_add_epi64 #define npyv_sum_f32 _mm512_reduce_add_ps #define npyv_sum_f64 _mm512_reduce_add_pd + #define npyv_reducemin_u32 _mm512_reduce_min_epu32 + #define npyv_reducemin_s32 _mm512_reduce_min_epi32 + #define npyv_reducemin_f32 _mm512_reduce_min_ps + #define npyv_reducemax_u32 _mm512_reduce_max_epu32 + #define npyv_reducemax_s32 _mm512_reduce_max_epi32 + #define npyv_reducemax_f32 _mm512_reduce_max_ps #else + NPY_FINLINE npy_uint32 npyv_reducemax_u32(npyv_u32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_u32 a1 = _mm512_max_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_u32 a2 = _mm512_max_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_u32 a3 = _mm512_max_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_u32 a4 = _mm512_max_epu32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_int32 npyv_reducemax_s32(npyv_s32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_s32 a1 = _mm512_max_epi32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_s32 a2 = _mm512_max_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_s32 a3 = _mm512_max_epi32(a2, 
_mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_s32 a4 = _mm512_max_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_float npyv_reducemax_f32(npyv_f32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_f32 a1 = _mm512_max_ps(a, _mm512_permutex2var_ps(a, idx1, a)); + npyv_f32 a2 = _mm512_max_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); + npyv_f32 a3 = _mm512_max_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_f32 a4 = _mm512_max_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); + } + + NPY_FINLINE npy_uint32 npyv_reducemin_u32(npyv_u32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_u32 a1 = _mm512_min_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_u32 a2 = _mm512_min_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_u32 a3 = _mm512_min_epu32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_u32 a4 = _mm512_min_epu32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_int32 npyv_reducemin_s32(npyv_s32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_s32 a1 = _mm512_min_epi32(a, _mm512_permutex2var_epi32(a, idx1, a)); + npyv_s32 a2 = _mm512_min_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); + npyv_s32 a3 = _mm512_min_epi32(a2, _mm512_shuffle_epi32(a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_s32 a4 = _mm512_min_epi32(a3, _mm512_shuffle_epi32(a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); + } + + NPY_FINLINE npy_float npyv_reducemin_f32(npyv_f32 a) + { + const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + npyv_f32 a1 = _mm512_min_ps(a, _mm512_permutex2var_ps(a, idx1, a)); + npyv_f32 a2 = _mm512_min_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); + npyv_f32 a3 = _mm512_min_ps(a2, _mm512_shuffle_ps(a2, a2, (1<<6 | 0<<4 | 3<<2 | 2))); + npyv_f32 a4 = _mm512_min_ps(a3, _mm512_shuffle_ps(a3, a3, (2<<6 | 3<<4 | 0<<2 | 1))); + return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); + } + NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) + { + __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a)); diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index a1de580d9..e4eb4f49e 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o) } +NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val) +{ + *val = PyArray_PyIntAsIntp(o); + if (error_converting(*val)) { + return NPY_FAIL; + } + return NPY_SUCCEED; +} + + /* * PyArray_IntpFromIndexSequence * Returns the number of dimensions or -1 if an error occurred.
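The new converter follows CPython's "O&" converter protocol: it returns NPY_SUCCEED (1) with the parsed value written through `val`, or NPY_FAIL (0) with an exception already set. A minimal sketch of a hypothetical caller (the function name and argument are illustrative, not part of this commit):

    static int
    get_count_arg(PyObject *obj, npy_intp *count)
    {
        /* PyArray_IntpFromPyIntConverter raises (e.g. OverflowError or
         * TypeError) and returns NPY_FAIL when `obj` is unusable as an
         * integer index. */
        if (PyArray_IntpFromPyIntConverter(obj, count) != NPY_SUCCEED) {
            return -1;  /* exception is already set */
        }
        return 0;
    }

readtext.c below plugs this same converter into `npy_parse_arguments` for the `skiplines` and `max_rows` keywords.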
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 4072841ee..4d0fbb894 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -7,6 +7,9 @@ NPY_NO_EXPORT int PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq); NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val); + +NPY_NO_EXPORT int PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq); typedef enum { diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 789446d0c..a7b6898e1 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ +#include "textreading/readtext.h" /* _load_from_filelike */ #include "npy_dlpack.h" @@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, METH_O, NULL}, + {"_load_from_filelike", (PyCFunction)_load_from_filelike, + METH_FASTCALL | METH_KEYWORDS, NULL}, /* from umath */ {"frompyfunc", (PyCFunction) ufunc_frompyfunc, diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c new file mode 100644 index 000000000..11f4210f7 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -0,0 +1,395 @@ + +#include <Python.h> + +#include <string.h> +#include <stdlib.h> +#include <stdbool.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + +#include "conversions.h" +#include "str_to_int.h" + +#include "array_coercion.h" + + +/* + * Coercion to boolean is done via integer right now. + */ +NPY_NO_EXPORT int +to_bool(PyArray_Descr *NPY_UNUSED(descr), + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + int64_t res; + if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) { + return -1; + } + *dataptr = (char)(res != 0); + return 0; +} + +
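Since boolean coercion goes through str_to_int64, any integer string becomes a boolean while names such as "true" are rejected. An assert-style sketch of this behaviour (not part of the commit; `descr` and `pconfig` are unused by to_bool, so NULL is passed here):

    char out;
    Py_UCS4 zero[] = {'0', 0};
    Py_UCS4 minus3[] = {'-', '3', 0};
    /* "0" parses to false, any non-zero integer to true: */
    assert(to_bool(NULL, zero, zero + 1, &out, NULL) == 0 && out == 0);
    assert(to_bool(NULL, minus3, minus3 + 2, &out, NULL) == 0 && out == 1);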
+/* + * In order to not pack a whole copy of a floating point parser, we copy the + * input into an ASCII buffer and call the Python parser. Float parsing isn't + * especially quick, so this is not terrible, but avoiding it would speed + * things up. + * + * Also note that parsing the first float of a complex will copy the whole + * string to ASCII rather than just the first part. + * TODO: A tweak of the break might be a simple mitigation there. + * + * @param str The UCS4 string to parse + * @param end Pointer to the end of the string + * @param strip_whitespace If true, strip leading and trailing whitespace; the + * complex parser passes false and handles whitespace itself. + * @param result Output stored as double value. + * @param p_end Output pointer to the first character that was not parsed. + */ +static NPY_INLINE int +double_from_ucs4( + const Py_UCS4 *str, const Py_UCS4 *end, + bool strip_whitespace, double *result, const Py_UCS4 **p_end) +{ + /* skip leading whitespace */ + if (strip_whitespace) { + while (Py_UNICODE_ISSPACE(*str)) { + str++; + } + } + if (str == end) { + return -1; /* empty or only whitespace: not a floating point number */ + } + + /* We convert to ASCII for the Python parser, use stack if small: */ + char stack_buf[128]; + char *heap_buf = NULL; + char *ascii = stack_buf; + + size_t str_len = end - str + 1; + if (str_len > 128) { + heap_buf = PyMem_MALLOC(str_len); + if (heap_buf == NULL) { + PyErr_NoMemory(); + return -1; + } + ascii = heap_buf; + } + char *c = ascii; + for (; str < end; str++, c++) { + if (NPY_UNLIKELY(*str >= 128)) { + /* Character cannot be used, ignore for end calculation and stop */ + end = str; + break; + } + *c = (char)(*str); + } + *c = '\0'; + + char *end_parsed; + *result = PyOS_string_to_double(ascii, &end_parsed, NULL); + /* Rewind `end` to the first UCS4 character not parsed: */ + end = end - (c - end_parsed); + + PyMem_FREE(heap_buf); + + if (*result == -1. && PyErr_Occurred()) { + return -1; + } + + if (strip_whitespace) { + /* and then skip any remaining whitespace: */ + while (Py_UNICODE_ISSPACE(*end)) { + end++; + } + } + *p_end = end; + return 0; +} + + +NPY_NO_EXPORT int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double double_val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + float val = (float)double_val; + memcpy(dataptr, &val, sizeof(float)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap4_unaligned(dataptr); + } + return 0; +} + + +NPY_NO_EXPORT int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + memcpy(dataptr, &val, sizeof(double)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap8_unaligned(dataptr); + } + return 0; +} + +
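The complex parser that follows accepts plain reals, pure imaginaries, and (optionally) parenthesized forms with inner whitespace. A sketch of the accepted inputs; the `ucs4_of` helper is hypothetical and exists only for this example:

    /* Widens a C string into `buf` and returns the end pointer
     * (hypothetical helper, not part of the commit): */
    static const Py_UCS4 *
    ucs4_of(const char *s, Py_UCS4 *buf)
    {
        size_t i = 0;
        for (; s[i] != '\0'; i++) {
            buf[i] = (Py_UCS4)s[i];
        }
        buf[i] = 0;      /* parsers may rely on NUL termination */
        return buf + i;
    }

    static void
    example_complex_forms(Py_UCS4 *buf)
    {
        double re, im;
        const Py_UCS4 *end;

        end = ucs4_of("3.5", buf);       /* real only: im becomes 0.0  */
        assert(to_complex_int(buf, end, &re, &im, 'j', true));
        end = ucs4_of("1.5j", buf);      /* pure imaginary             */
        assert(to_complex_int(buf, end, &re, &im, 'j', true));
        end = ucs4_of("( 1+2j )", buf);  /* parentheses + whitespace   */
        assert(to_complex_int(buf, end, &re, &im, 'j', true));
    }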
"3.5") + *p_imag = 0.0; + return !unmatched_opening_paren; + } + if (*p_end == imaginary_unit) { + /* Only an imaginary part (e.g "1.5j") */ + *p_imag = *p_real; + *p_real = 0.0; + ++p_end; + } + else if (*p_end == '+' || *p_end == '-') { + /* Imaginary part still to parse */ + if (*p_end == '+') { + ++p_end; /* Advance to support +- (and ++) */ + } + if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) { + return false; + } + if (*p_end != imaginary_unit) { + return false; + } + ++p_end; + } + else { + *p_imag = 0; + } + + if (unmatched_opening_paren) { + /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */ + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + if (*p_end == ')') { + ++p_end; + } + else { + /* parentheses was not closed */ + return false; + } + } + + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + return p_end == token_end; +} + + +NPY_NO_EXPORT int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, + pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex64 val = {(float)real, (float)imag}; + memcpy(dataptr, &val, sizeof(npy_complex64)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap4_unaligned(dataptr); + npy_bswap4_unaligned(dataptr + 4); + } + return 0; +} + + +NPY_NO_EXPORT int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex128 val = {real, imag}; + memcpy(dataptr, &val, sizeof(npy_complex128)); + if (!PyArray_ISNBO(descr->byteorder)) { + npy_bswap8_unaligned(dataptr); + npy_bswap8_unaligned(dataptr + 8); + } + return 0; +} + + +/* + * String and unicode conversion functions. + */ +NPY_NO_EXPORT int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + const Py_UCS4* c = str; + size_t length = descr->elsize; + + for (size_t i = 0; i < length; i++) { + if (c < end) { + /* + * loadtxt assumed latin1, which is compatible with UCS1 (first + * 256 unicode characters). + */ + if (NPY_UNLIKELY(*c > 255)) { + /* TODO: Was UnicodeDecodeError, is unspecific error good? */ + return -1; + } + dataptr[i] = (Py_UCS1)(*c); + c++; + } + else { + dataptr[i] = '\0'; + } + } + return 0; +} + + +NPY_NO_EXPORT int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + int length = descr->elsize / 4; + + if (length <= end - str) { + memcpy(dataptr, str, length * 4); + } + else { + size_t given_len = end - str; + memcpy(dataptr, str, given_len * 4); + memset(dataptr + given_len * 4, '\0', (length - given_len) * 4); + } + + if (!PyArray_ISNBO(descr->byteorder)) { + for (int i = 0; i < length; i++) { + npy_bswap4_unaligned(dataptr); + dataptr += 4; + } + } + return 0; +} + + + +/* + * Convert functions helper for the generic converter. 
+ */ +static PyObject * +call_converter_function( + PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters) +{ + PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length); + if (s == NULL) { + return s; + } + if (byte_converters) { + Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL)); + if (s == NULL) { + return NULL; + } + } + if (func == NULL) { + return s; + } + PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL); + Py_DECREF(s); + return result; +} + + +NPY_NO_EXPORT int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config, PyObject *func) +{ + bool use_byte_converter; + if (func == NULL) { + use_byte_converter = config->c_byte_converters; + } + else { + use_byte_converter = config->python_byte_converters; + } + /* Converts to unicode and calls custom converter (if set) */ + PyObject *converted = call_converter_function( + func, str, (size_t)(end - str), use_byte_converter); + if (converted == NULL) { + return -1; + } + + int res = PyArray_Pack(descr, dataptr, converted); + Py_DECREF(converted); + return res; +} + + +NPY_NO_EXPORT int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config) +{ + return to_generic_with_converter(descr, str, end, dataptr, config, NULL); +} diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h new file mode 100644 index 000000000..222eea4e7 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.h @@ -0,0 +1,57 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ + +#include <stdbool.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/parser_config.h" + +NPY_NO_EXPORT int +to_bool(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +NPY_NO_EXPORT int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +NPY_NO_EXPORT int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +NPY_NO_EXPORT int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused, PyObject *func); + +NPY_NO_EXPORT int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c new file mode 100644 index 000000000..0722efd57 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -0,0 +1,201 @@ +#include 
"field_types.h" +#include "conversions.h" +#include "str_to_int.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" +#include "alloc.h" + +#include "textreading/growth.h" + + +NPY_NO_EXPORT void +field_types_xclear(int num_field_types, field_type *ft) { + assert(num_field_types >= 0); + if (ft == NULL) { + return; + } + for (int i = 0; i < num_field_types; i++) { + Py_XDECREF(ft[i].descr); + ft[i].descr = NULL; + } + PyMem_Free(ft); +} + + +/* + * Fetch custom converters for the builtin NumPy DTypes (or the generic one). + * Structured DTypes get unpacked and `object` uses the generic method. + * + * TODO: This should probably be moved on the DType object in some form, + * to allow user DTypes to define their own converters. + */ +static set_from_ucs4_function * +get_from_ucs4_function(PyArray_Descr *descr) +{ + if (descr->type_num == NPY_BOOL) { + return &to_bool; + } + else if (PyDataType_ISSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_int8; + case 2: + return &to_int16; + case 4: + return &to_int32; + case 8: + return &to_int64; + default: + assert(0); + } + } + else if (PyDataType_ISUNSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_uint8; + case 2: + return &to_uint16; + case 4: + return &to_uint32; + case 8: + return &to_uint64; + default: + assert(0); + } + } + else if (descr->type_num == NPY_FLOAT) { + return &to_float; + } + else if (descr->type_num == NPY_DOUBLE) { + return &to_double; + } + else if (descr->type_num == NPY_CFLOAT) { + return &to_cfloat; + } + else if (descr->type_num == NPY_CDOUBLE) { + return &to_cdouble; + } + else if (descr->type_num == NPY_STRING) { + return &to_string; + } + else if (descr->type_num == NPY_UNICODE) { + return &to_unicode; + } + return &to_generic; +} + + +/* + * Note that the function cleans up `ft` on error. If `num_field_types < 0` + * cleanup has already happened in the internal call. 
+ */ +static npy_intp +field_type_grow_recursive(PyArray_Descr *descr, + npy_intp num_field_types, field_type **ft, npy_intp *ft_size, + npy_intp field_offset) +{ + if (PyDataType_HASSUBARRAY(descr)) { + PyArray_Dims shape = {NULL, -1}; + + if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, "invalid subarray shape"); + field_types_xclear(num_field_types, *ft); + return -1; + } + npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + for (npy_intp i = 0; i < size; i++) { + num_field_types = field_type_grow_recursive(descr->subarray->base, + num_field_types, ft, ft_size, field_offset); + field_offset += descr->subarray->base->elsize; + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + else if (PyDataType_HASFIELDS(descr)) { + npy_int num_descr_fields = PyTuple_Size(descr->names); + if (num_descr_fields < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + for (npy_intp i = 0; i < num_descr_fields; i++) { + PyObject *key = PyTuple_GET_ITEM(descr->names, i); + PyObject *tup = PyObject_GetItem(descr->fields, key); + if (tup == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + PyArray_Descr *field_descr; + PyObject *title; + int offset; + if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) { + Py_DECREF(tup); + field_types_xclear(num_field_types, *ft); + return -1; + } + Py_DECREF(tup); + num_field_types = field_type_grow_recursive( + field_descr, num_field_types, ft, ft_size, + field_offset + offset); + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + + if (*ft_size <= num_field_types) { + npy_intp alloc_size = grow_size_and_multiply( + ft_size, 4, sizeof(field_type)); + if (alloc_size < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + field_type *new_ft = PyMem_Realloc(*ft, alloc_size); + if (new_ft == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + *ft = new_ft; + } + + Py_INCREF(descr); + (*ft)[num_field_types].descr = descr; + (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr); + (*ft)[num_field_types].structured_offset = field_offset; + + return num_field_types + 1; +} + + +/* + * Prepare the "field_types" for the given dtypes/descriptors. Currently, + * we copy the itemsize, but the main thing is that we check for custom + * converters. + */ +NPY_NO_EXPORT npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft) +{ + if (descr->subarray != NULL) { + /* + * This could probably be allowed, but NumPy absorbs the dimensions + * so it is an awkward corner case that probably never really worked. + */ + PyErr_SetString(PyExc_TypeError, + "file reader does not support subarray dtypes. 
You can" + "put the dtype into a structured one using " + "`np.dtype(('name', dtype))` to avoid this limitation."); + return -1; + } + + npy_intp ft_size = 4; + *ft = PyMem_Malloc(ft_size * sizeof(field_type)); + if (*ft == NULL) { + return -1; + } + return field_type_grow_recursive(descr, 0, ft, &ft_size, 0); +} diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h new file mode 100644 index 000000000..f26e00a5e --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -0,0 +1,67 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ + +#include <stdint.h> +#include <stdbool.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +/** + * Function defining the conversion for each value. + * + * This function must support unaligned memory access. As of now, there is + * no special error handling (in whatever form): We assume that it is always + * reasonable to raise a `ValueError` noting the string that failed to be + * converted. + * + * NOTE: An earlier version of the code had unused default values (pandas + * does this) when columns are missing. We could define this either + * by passing `NULL` in, or by adding a default explicitly somewhere. + * (I think users should probably have to define the default, at which + * point it doesn't matter here.) + * + * NOTE: We are currently passing the parser config, this could be made public + * or could be set up to be dtype specific/private. Always passing + * pconfig fully seems easier right now even if it may change. + * (A future use-case may for example be user-specified strings that are + * considered boolean True or False). + * + * TODO: Aside from nailing down the above notes, it may be nice to expose + * these function publically. This could allow user DTypes to provide + * a converter or custom converters written in C rather than Python. + * + * @param descr The NumPy descriptor of the field (may be byte-swapped, etc.) + * @param str Pointer to the beginning of the UCS4 string to be parsed. + * @param end Pointer to the end of the UCS4 string. This value is currently + * guaranteed to be `\0`, ensuring that parsers can rely on + * nul-termination. + * @param dataptr The pointer where to store the parsed value + * @param pconfig Additional configuration for the parser. + * @returns 0 on success and -1 on failure. If the return value is -1 an + * error may or may not be set. If an error is set, it is chained + * behind the generic ValueError. + */ +typedef int (set_from_ucs4_function)( + PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, + char *dataptr, parser_config *pconfig); + +typedef struct _field_type { + set_from_ucs4_function *set_from_ucs4; + /* The original NumPy descriptor */ + PyArray_Descr *descr; + /* Offset to this entry within row. 
+typedef struct _field_type { + set_from_ucs4_function *set_from_ucs4; + /* The original NumPy descriptor */ + PyArray_Descr *descr; + /* Offset to this entry within row. */ + npy_intp structured_offset; +} field_type; + + +NPY_NO_EXPORT void +field_types_xclear(int num_field_types, field_type *ft); + +NPY_NO_EXPORT npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ */ diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c new file mode 100644 index 000000000..49a09d572 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.c @@ -0,0 +1,47 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "templ_common.h" + +/* + * Helper function taking the size input and growing it (based on min_grow). + * The current scheme is a minimum growth and a general growth by 25% + * overallocation. This is then capped at 2**20 elements, as that propels us + * in the range of large page sizes (so it is presumably more than enough). + * + * It further multiplies it with `itemsize` and ensures that all results fit + * into an `npy_intp`. + * Returns -1 if any overflow occurred or the result would not fit. + * The caller has to ensure the input is ssize_t and not negative. + */ +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize) { + /* min_grow must be a power of two: */ + assert((min_grow & (min_grow - 1)) == 0); + npy_uintp new_size = (npy_uintp)*size; + npy_intp growth = *size >> 2; + if (growth <= min_grow) { + /* can never lead to overflow if we are using min_growth */ + new_size += min_grow; + } + else { + if (growth > 1 << 20) { + /* limit growth to order of MiB (even hugepages are not larger) */ + growth = 1 << 20; + } + new_size += growth + min_grow - 1; + new_size &= ~(npy_uintp)(min_grow - 1); /* round down to a multiple of min_grow */ + + if (new_size > NPY_MAX_INTP) { + return -1; + } + } + *size = (npy_intp)new_size; + npy_intp alloc_size; + if (npy_mul_with_overflow_intp(&alloc_size, (npy_intp)new_size, itemsize)) { + return -1; + } + return alloc_size; +} diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h new file mode 100644 index 000000000..237b77ad3 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.h @@ -0,0 +1,7 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ + +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
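A worked example of the growth pattern (a sketch, not in the commit), assuming `min_grow = 8` and `itemsize = 16`:

    npy_intp size = 0;
    for (int i = 0; i < 4; i++) {
        npy_intp alloc_size = grow_size_and_multiply(&size, 8, 16);
        if (alloc_size < 0) {
            break;  /* overflow: result would not fit an npy_intp */
        }
        /* size becomes 8, 16, 24, 32 (minimum growth while size is
         * small); alloc_size is size * 16 bytes. Once size/4 exceeds
         * min_grow, steps switch to ~25% overallocation rounded to a
         * multiple of min_grow, capped at 2**20 elements. */
    }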
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h new file mode 100644 index 000000000..00e911667 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/parser_config.h @@ -0,0 +1,61 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ + +#include <stdbool.h> + +typedef struct { + /* + * Field delimiter character. + * Typically ',', ' ', '\t', ignored if `delimiter_is_whitespace` is true. + */ + Py_UCS4 delimiter; + + /* + * Character used to quote fields. + * Typically '"' or "'". To disable quoting we set this to UINT_MAX + * (which is not a valid unicode character and thus cannot occur in the + * file; the same is used for all other characters if necessary). + */ + Py_UCS4 quote; + + /* + * Character(s) that indicate the start of a comment. + * Typically '#', '%' or ';'. + * When encountered in a line and not inside quotes, all characters + * from the comment character(s) to the end of the line are ignored. + */ + Py_UCS4 comment; + + /* + * Ignore whitespace at the beginning of a field (outside/before quotes). + * Is (and must be) set if `delimiter_is_whitespace`. + */ + bool ignore_leading_whitespace; + + /* + * If true, the delimiter is ignored and any unicode whitespace is used + * for splitting (same as `string.split()` in Python). In that case + * `ignore_leading_whitespace` should also be set. + */ + bool delimiter_is_whitespace; + + /* + * The imaginary unit character. Default is `j`. + */ + Py_UCS4 imaginary_unit; + + /* + * Data should be encoded as `latin1` when using the Python converter + * (implementing the `loadtxt` default Python 2 compatibility mode). + * The C byte converter is used when the user requested `dtype="S"`. + * In this case we go via `dtype=object`, however, loadtxt allows latin1 + * while normal object to string casts only accept ASCII, so it ensures + * that the object array already contains bytes and not strings. + */ + bool python_byte_converters; + bool c_byte_converters; +} parser_config; + + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
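For illustration, a configuration for whitespace-delimited input would look roughly like this (a sketch, not in the commit); note that per the comments above, `delimiter_is_whitespace` requires `ignore_leading_whitespace`:

    parser_config pc = {
        .delimiter = (Py_UCS4)-1,         /* unused in whitespace mode */
        .quote = '"',
        .comment = '#',
        .ignore_leading_whitespace = true,
        .delimiter_is_whitespace = true,
        .imaginary_unit = 'j',
        .python_byte_converters = false,
        .c_byte_converters = false,
    };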
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c new file mode 100644 index 000000000..7af5ee891 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -0,0 +1,312 @@ +#include <stdio.h> +#include <stdbool.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "npy_argparse.h" +#include "common.h" +#include "conversion_utils.h" + +#include "textreading/parser_config.h" +#include "textreading/stream_pyobject.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/str_to_int.h" + + +// +// `usecols` must point to a Python object that is Py_None or a 1-d contiguous +// numpy array with data type int32. +// +// `dtype` must point to a Python object that is Py_None or a numpy dtype +// instance. If the latter, code and sizes must be arrays of length +// num_dtype_fields, holding the flattened data field type codes and byte +// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, +// but we do that in Python code.) +// +// If both `usecols` and `dtype` are not None, and the data type is compound, +// then len(usecols) must equal num_dtype_fields. +// +// If `dtype` is given and it is compound, and `usecols` is None, then the +// number of columns in the file must match the number of fields in `dtype`. +// +static PyObject * +_readtext_from_stream(stream *s, + parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], + Py_ssize_t skiplines, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) +{ + PyArrayObject *arr = NULL; + PyArray_Descr *out_dtype = NULL; + field_type *ft = NULL; + + /* + * If the dtype is not structured, the result is considered "homogeneous" + * and we have to discover the number of columns. + */ + out_dtype = (PyArray_Descr *)dtype; + Py_INCREF(out_dtype); + + Py_ssize_t num_fields = field_types_create(out_dtype, &ft); + if (num_fields < 0) { + /* `ft` was already cleaned up by `field_types_create` */ + ft = NULL; + num_fields = 0; + goto finish; + } + bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; + + if (!homogeneous && usecols != NULL && num_usecols != num_fields) { + PyErr_Format(PyExc_TypeError, + "If a structured dtype is used, the number of columns in " + "`usecols` must match the effective number of fields. " + "But %zd usecols were given and the number of fields is %zd.", + num_usecols, num_fields); + goto finish; + } + + arr = read_rows( + s, max_rows, num_fields, ft, pc, + num_usecols, usecols, skiplines, converters, + NULL, out_dtype, homogeneous); + if (arr == NULL) { + goto finish; + } + + finish: + Py_XDECREF(out_dtype); + field_types_xclear(num_fields, ft); + return (PyObject *)arr; +} + + +static int +parse_control_character(PyObject *obj, Py_UCS4 *character) +{ + if (obj == Py_None) { + *character = (Py_UCS4)-1; /* character beyond unicode range */ + return 1; + } + if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) != 1) { + PyErr_Format(PyExc_TypeError, + "Text reading control character must be a single unicode " + "character or None; but got: %.100R", obj); + return 0; + } + *character = PyUnicode_READ_CHAR(obj, 0); + return 1; +}
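Illustrative behaviour of this converter (a sketch, not in the commit):

    Py_UCS4 c;
    PyObject *comma = PyUnicode_FromString(",");
    assert(parse_control_character(comma, &c) == 1 && c == ',');
    /* None maps to (Py_UCS4)-1, a sentinel outside the unicode range: */
    assert(parse_control_character(Py_None, &c) == 1 && c == (Py_UCS4)-1);
    Py_DECREF(comma);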
+/* + * A (somewhat verbose) check that none of the control characters match or are + * newline. Most of these combinations are completely fine, just weird or + * surprising. + * (I.e. there is an implicit priority for control characters, so if a comment + * matches a delimiter, it would just be a comment.) + * In theory some `delimiter=None` paths could have a "meaning", but let us + * assume that users are better off setting one of the control chars to `None` + * for clarity. + * + * This also checks that the control characters cannot be newlines. + */ +static int +error_if_matching_control_characters( + Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment) +{ + char *control_char1; + char *control_char2 = NULL; + if (comment != (Py_UCS4)-1) { + control_char1 = "comment"; + if (comment == '\r' || comment == '\n') { + goto error; + } + else if (comment == quote) { + control_char2 = "quotechar"; + goto error; + } + else if (comment == delimiter) { + control_char2 = "delimiter"; + goto error; + } + } + if (quote != (Py_UCS4)-1) { + control_char1 = "quotechar"; + if (quote == '\r' || quote == '\n') { + goto error; + } + else if (quote == delimiter) { + control_char2 = "delimiter"; + goto error; + } + } + if (delimiter != (Py_UCS4)-1) { + control_char1 = "delimiter"; + if (delimiter == '\r' || delimiter == '\n') { + goto error; + } + } + /* The above doesn't work with delimiter=None, which means "whitespace" */ + if (delimiter == (Py_UCS4)-1) { + control_char1 = "delimiter"; + if (Py_UNICODE_ISSPACE(comment)) { + control_char2 = "comment"; + goto error; + } + else if (Py_UNICODE_ISSPACE(quote)) { + control_char2 = "quotechar"; + goto error; + } + } + return 0; + + error: + if (control_char2 != NULL) { + PyErr_Format(PyExc_TypeError, + "The values for control characters '%s' and '%s' are " + "incompatible", + control_char1, control_char2); + } + else { + PyErr_Format(PyExc_TypeError, + "control character '%s' cannot be a newline (`\\r` or `\\n`).", + control_char1); + } + return -1; +}
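For example (a sketch, not in the commit), the check accepts the default configuration but rejects overlapping or newline control characters:

    assert(error_if_matching_control_characters(',', '"', '#') == 0);
    /* comment equal to the delimiter -> TypeError and -1: */
    assert(error_if_matching_control_characters(',', '"', ',') == -1);
    /* newline control characters are always rejected: */
    assert(error_if_matching_control_characters('\n', '"', '#') == -1);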
+NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *NPY_UNUSED(mod), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *file; + Py_ssize_t skiplines = 0; + Py_ssize_t max_rows = -1; + PyObject *usecols_obj = Py_None; + PyObject *converters = Py_None; + + PyObject *dtype = Py_None; + PyObject *encoding_obj = Py_None; + const char *encoding = NULL; + + parser_config pc = { + .delimiter = ',', + .comment = '#', + .quote = '"', + .imaginary_unit = 'j', + .delimiter_is_whitespace = false, + .ignore_leading_whitespace = false, + .python_byte_converters = false, + .c_byte_converters = false, + }; + bool filelike = true; + + PyObject *arr = NULL; + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, + "file", NULL, &file, + "|delimiter", &parse_control_character, &pc.delimiter, + "|comment", &parse_control_character, &pc.comment, + "|quote", &parse_control_character, &pc.quote, + "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, + "|usecols", NULL, &usecols_obj, + "|skiplines", &PyArray_IntpFromPyIntConverter, &skiplines, + "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, + "|converters", NULL, &converters, + "|dtype", NULL, &dtype, + "|encoding", NULL, &encoding_obj, + "|filelike", &PyArray_BoolConverter, &filelike, + "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters, + "|c_byte_converters", &PyArray_BoolConverter, &pc.c_byte_converters, + NULL, NULL, NULL) < 0) { + return NULL; + } + + /* Reject matching control characters, they just rarely make sense anyway */ + if (error_if_matching_control_characters( + pc.delimiter, pc.quote, pc.comment) < 0) { + return NULL; + } + + if (pc.delimiter == (Py_UCS4)-1) { + pc.delimiter_is_whitespace = true; + /* Ignore leading whitespace to match `string.split(None)` */ + pc.ignore_leading_whitespace = true; + } + + if (!PyArray_DescrCheck(dtype)) { + PyErr_SetString(PyExc_TypeError, + "internal error: dtype must be provided and be a NumPy dtype"); + return NULL; + } + + if (encoding_obj != Py_None) { + if (!PyUnicode_Check(encoding_obj)) { + PyErr_SetString(PyExc_TypeError, + "encoding must be a unicode string."); + return NULL; + } + encoding = PyUnicode_AsUTF8(encoding_obj); + if (encoding == NULL) { + return NULL; + } + } + + /* + * Parse usecols, the rest of NumPy has no clear helper for this, so do + * it here manually. + */ + Py_ssize_t num_usecols = -1; + Py_ssize_t *usecols = NULL; + if (usecols_obj != Py_None) { + num_usecols = PySequence_Length(usecols_obj); + if (num_usecols < 0) { + return NULL; + } + /* Calloc just to not worry about overflow */ + usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + if (usecols == NULL) { + PyErr_NoMemory(); + return NULL; + } + for (Py_ssize_t i = 0; i < num_usecols; i++) { + PyObject *tmp = PySequence_GetItem(usecols_obj, i); + if (tmp == NULL) { + PyMem_FREE(usecols); + return NULL; + } + usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (error_converting(usecols[i])) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, + "usecols must be an int or a sequence of ints but " + "it contains at least one element of type '%s'", + Py_TYPE(tmp)->tp_name); + } + Py_DECREF(tmp); + PyMem_FREE(usecols); + return NULL; + } + Py_DECREF(tmp); + } + } + + stream *s; + if (filelike) { + s = stream_python_file(file, encoding); + } + else { + s = stream_python_iterable(file, encoding); + } + if (s == NULL) { + PyMem_FREE(usecols); + return NULL; + } + + arr = _readtext_from_stream( + s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype); + stream_close(s); + PyMem_FREE(usecols); + return arr; +} diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h new file mode 100644 index 000000000..5cf48c555 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.h @@ -0,0 +1,7 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *mod, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c new file mode 100644 index 000000000..e30ff835e --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -0,0 +1,481 @@ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "numpy/npy_3kcompat.h" +#include "alloc.h" + +#include <string.h> +#include <stdbool.h> + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/conversions.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/growth.h" + +/* + * Minimum size to grow the allocation by (or 25%). The 8KiB means the actual + * growth is within `8 KiB <= size < 16 KiB` (depending on the row size). + */ +#define MIN_BLOCK_SIZE (1 << 13)
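To see how this constant plays out, a worked example (a sketch, not in the commit) mirroring the rows_per_block computation in read_rows below, assuming a 24-byte row:

    npy_intp row_size = 24;
    size_t min_rows = (MIN_BLOCK_SIZE + row_size - 1) / row_size;  /* 342 */
    size_t rows_per_block = 1;
    while (rows_per_block < min_rows) {
        rows_per_block *= 2;                                       /* 512 */
    }
    /* Each growth block is 512 * 24 = 12288 bytes, i.e. within the
     * documented window 8 KiB <= size < 16 KiB. */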
+/* + * Create the array of converter functions from the Python converters. + */ +static PyObject ** +create_conv_funcs( + PyObject *converters, Py_ssize_t num_fields, const Py_ssize_t *usecols) +{ + assert(converters != Py_None); + + PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *)); + if (conv_funcs == NULL) { + PyErr_NoMemory(); + return NULL; + } + + if (PyCallable_Check(converters)) { + /* a single converter used for all columns individually */ + for (Py_ssize_t i = 0; i < num_fields; i++) { + Py_INCREF(converters); + conv_funcs[i] = converters; + } + return conv_funcs; + } + else if (!PyDict_Check(converters)) { + PyErr_SetString(PyExc_TypeError, + "converters must be a dictionary mapping columns to converter " + "functions or a single callable."); + goto error; + } + + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(converters, &pos, &key, &value)) { + Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError); + if (column == -1 && PyErr_Occurred()) { + PyErr_Format(PyExc_TypeError, + "keys of the converters dictionary must be integers; " + "got %.100R", key); + goto error; + } + if (usecols != NULL) { + /* + * This code searches for the corresponding usecol. It is + * identical to the legacy usecols code, which has two weaknesses: + * 1. It fails for duplicated usecols, only setting the converter + * for the first one. + * 2. It fails e.g. if usecols uses negative indexing and + * converters does not. (This is a feature, since it allows + * us to correctly normalize converters to result column here.) + */ + Py_ssize_t i = 0; + for (; i < num_fields; i++) { + if (column == usecols[i]) { + column = i; + break; + } + } + if (i == num_fields) { + continue; /* ignore unused converter */ + } + } + else { + if (column < -num_fields || column >= num_fields) { + PyErr_Format(PyExc_ValueError, + "converter specified for column %zd, which is invalid " + "for the number of fields %zd.", column, num_fields); + goto error; + } + if (column < 0) { + column += num_fields; + } + } + if (!PyCallable_Check(value)) { + PyErr_Format(PyExc_TypeError, + "values of the converters dictionary must be callable, " + "but the value associated with key %R is not", key); + goto error; + } + Py_INCREF(value); + conv_funcs[column] = value; + } + return conv_funcs; + + error: + for (Py_ssize_t i = 0; i < num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + return NULL; +} + +/** + * Read a file into the provided array, or create (and possibly grow) an + * array to read into. + * + * @param s The stream object/struct providing reading capabilities used by + * the tokenizer. + * @param max_rows The number of rows to read, or -1. If negative + * all rows are read. + * @param num_field_types The number of field types stored in `field_types`. + * @param field_types Information about the dtype for each column (or one if + * `homogeneous`). + * @param pconfig Pointer to the parser config object used by both the + * tokenizer and the conversion functions. + * @param num_usecols The number of columns in `usecols`. + * @param usecols An array of length `num_usecols` or NULL. If given indicates + * which column is read for each individual row (negative columns are + * accepted). + * @param skiplines The number of lines to skip; these lines are ignored. + * @param converters Python dictionary of converters. Finalizing converters + * is difficult without information about the number of columns. + * @param data_array An array to be filled or NULL. In either case a new + * reference is returned (the reference to `data_array` is not stolen).
+ * @param out_descr The dtype used for allocating a new array. This is not + * used if `data_array` is provided. Note that the actual dtype of the + * returned array can differ for strings. + * @param homogeneous Whether the datatype of the array is homogeneous, i.e. + * not structured. In this case the number of columns has to be + * discovered and the returned array will be 2-dimensional rather than + * 1-dimensional. + * + * @returns Returns the result as an array object or NULL on error. The result + * is always a new reference (even when `data_array` was passed in). + */ +NPY_NO_EXPORT PyArrayObject * +read_rows(stream *s, + npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous) +{ + char *data_ptr = NULL; + Py_ssize_t current_num_fields; + npy_intp row_size = out_descr->elsize; + PyObject **conv_funcs = NULL; + + bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); + + int ndim = homogeneous ? 2 : 1; + npy_intp result_shape[2] = {0, 1}; + + bool data_array_allocated = data_array == NULL; + /* Make sure we own `data_array` for the purpose of error handling */ + Py_XINCREF(data_array); + size_t rows_per_block = 1; /* will be increased depending on row size */ + npy_intp data_allocated_rows = 0; + + /* We give a warning if max_rows is used and an empty line is encountered */ + bool give_empty_row_warning = max_rows >= 0; + + int ts_result = 0; + tokenizer_state ts; + if (tokenizer_init(&ts, pconfig) < 0) { + goto error; + } + + /* Set the actual number of fields if it is already known, otherwise -1 */ + Py_ssize_t actual_num_fields = -1; + if (usecols != NULL) { + assert(homogeneous || num_field_types == num_usecols); + actual_num_fields = num_usecols; + } + else if (!homogeneous) { + assert(usecols == NULL || num_field_types == num_usecols); + actual_num_fields = num_field_types; + } + + for (Py_ssize_t i = 0; i < skiplines; i++) { + ts.state = TOKENIZE_GOTO_LINE_END; + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + else if (ts_result != 0) { + /* Fewer lines than skiplines is acceptable */ + break; + } + } + + Py_ssize_t row_count = 0; /* number of rows actually processed */ + while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) { + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + current_num_fields = ts.num_fields; + field_info *fields = ts.fields; + if (NPY_UNLIKELY(ts.num_fields == 0)) { + /* + * Deprecated NumPy 1.23, 2021-01-13 (not really a deprecation, + * but similar policy should apply to removing the warning again) + */ + /* Tokenizer may give a final "empty line" even if there is none */ + if (give_empty_row_warning && ts_result == 0) { + give_empty_row_warning = false; + if (PyErr_WarnFormat(PyExc_UserWarning, 3, + "Input line %zd contained no data and will not be " + "counted towards `max_rows=%zd`. This differs from " + "the behaviour in NumPy <=1.22 which counted lines " + "rather than rows. If desired, the previous behaviour " + "can be achieved by using `itertools.islice`.\n" + "Please see the 1.23 release notes for an example on " + "how to do this. If you wish to ignore this warning, " + "use `warnings.filterwarnings`. 
This warning is " + "expected to be removed in the future and is given " + "only once per `loadtxt` call.", + row_count + skiplines + 1, max_rows) < 0) { + goto error; + } + } + continue; /* Ignore empty line */ + } + + if (NPY_UNLIKELY(data_ptr == NULL)) { + // We've deferred some of the initialization tasks to here, + // because we've now read the first line, and we definitively + // know how many fields (i.e. columns) we will be processing. + if (actual_num_fields == -1) { + actual_num_fields = current_num_fields; + } + + if (converters != Py_None) { + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols); + if (conv_funcs == NULL) { + goto error; + } + } + + /* Note that result_shape[1] is only used if homogeneous is true */ + result_shape[1] = actual_num_fields; + if (homogeneous) { + row_size *= actual_num_fields; + } + + if (data_array == NULL) { + if (max_rows < 0) { + /* + * Negative max_rows means to read the whole file; we + * approach this by allocating ever larger blocks. + * Adds a number of rows based on `MIN_BLOCK_SIZE`. + * Note: later code grows assuming this is a power of two. + */ + if (row_size == 0) { + /* actual rows_per_block should not matter here */ + rows_per_block = 512; + } + else { + /* safe on overflow since min_rows will be 0 or 1 */ + size_t min_rows = ( + (MIN_BLOCK_SIZE + row_size - 1) / row_size); + while (rows_per_block < min_rows) { + rows_per_block *= 2; + } + } + data_allocated_rows = rows_per_block; + } + else { + data_allocated_rows = max_rows; + } + result_shape[0] = data_allocated_rows; + Py_INCREF(out_descr); + /* + * We do not use Empty, as it would fill with None + * and require decref'ing if we shrink again. + */ + data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr( + ndim, result_shape, out_descr); + if (data_array == NULL) { + goto error; + } +#ifdef NPY_RELAXED_STRIDES_DEBUG + /* Incompatible with NPY_RELAXED_STRIDES_DEBUG due to growing */ + if (result_shape[0] == 1) { + PyArray_STRIDES(data_array)[0] = row_size; + } +#endif /* NPY_RELAXED_STRIDES_DEBUG */ + if (needs_init) { + memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array)); + } + } + else { + assert(max_rows >= 0); + data_allocated_rows = max_rows; + } + data_ptr = PyArray_BYTES(data_array); + } + + if (!usecols && (actual_num_fields != current_num_fields)) { + PyErr_Format(PyExc_ValueError, + "the number of columns changed from %zd to %zd at row %zd; " + "use `usecols` to select a subset and avoid this error", + actual_num_fields, current_num_fields, row_count+1); + goto error; + } + + if (NPY_UNLIKELY(data_allocated_rows == row_count)) { + /* + * Grow by ~25% and rounded up to the next rows_per_block + * NOTE: This is based on very crude timings and could be refined! + */ + npy_intp new_rows = data_allocated_rows; + npy_intp alloc_size = grow_size_and_multiply( + &new_rows, rows_per_block, row_size); + if (alloc_size < 0) { + /* should normally error much earlier, but make sure */ + PyErr_SetString(PyExc_ValueError, + "array is too big. Cannot read file as a single array; " + "providing a maximum number of rows to read may help."); + goto error; + } + + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), alloc_size ? 
alloc_size : 1, + PyArray_HANDLER(data_array)); + if (new_data == NULL) { + PyErr_NoMemory(); + goto error; + } + /* Replace the array's data since it may have changed */ + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows; + data_ptr = new_data + row_count * row_size; + data_allocated_rows = new_rows; + if (needs_init) { + memset(data_ptr, '\0', (new_rows - row_count) * row_size); + } + } + + for (Py_ssize_t i = 0; i < actual_num_fields; ++i) { + Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. */ + Py_ssize_t col; /* The column as read, remapped by usecols */ + char *item_ptr; + if (homogeneous) { + f = 0; + item_ptr = data_ptr + i * field_types[0].descr->elsize; + } + else { + f = i; + item_ptr = data_ptr + field_types[f].structured_offset; + } + + if (usecols == NULL) { + col = i; + } + else { + col = usecols[i]; + if (col < 0) { + // Python-like column indexing: k = -1 means the last column. + col += current_num_fields; + } + if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) { + PyErr_Format(PyExc_ValueError, + "invalid column index %zd at row %zd with %zd " + "columns", + usecols[i], row_count+1, current_num_fields); + goto error; + } + } + + /* + * The following function calls represent the main "conversion" + * step, i.e. parsing the unicode string for each field and storing + * the result in the array. + */ + int parser_res; + Py_UCS4 *str = ts.field_buffer + fields[col].offset; + Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; + if (conv_funcs == NULL || conv_funcs[i] == NULL) { + parser_res = field_types[f].set_from_ucs4(field_types[f].descr, + str, end, item_ptr, pconfig); + } + else { + parser_res = to_generic_with_converter(field_types[f].descr, + str, end, item_ptr, pconfig, conv_funcs[i]); + } + + if (NPY_UNLIKELY(parser_res < 0)) { + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + + size_t length = end - str; + PyObject *string = PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, str, length); + if (string == NULL) { + npy_PyErr_ChainExceptions(exc, val, tb); + goto error; + } + PyErr_Format(PyExc_ValueError, + "could not convert string %.100R to %S at " + "row %zd, column %zd.", + string, field_types[f].descr, row_count, col+1); + Py_DECREF(string); + npy_PyErr_ChainExceptionsCause(exc, val, tb); + goto error; + } + } + + ++row_count; + data_ptr += row_size; + } + + tokenizer_clear(&ts); + PyMem_FREE(conv_funcs); + + if (data_array == NULL) { + assert(row_count == 0 && result_shape[0] == 0); + if (actual_num_fields == -1) { + /* + * We found no rows and have to discover the number of elements; + * we have no choice but to guess 1. + * NOTE: It may make sense to move this outside of here to refine + * the behaviour where necessary. + */ + result_shape[1] = 1; + } + else { + result_shape[1] = actual_num_fields; + } + Py_INCREF(out_descr); + data_array = (PyArrayObject *)PyArray_Empty( + ndim, result_shape, out_descr, 0); + } + + /* + * Note that if there is no data, `data_array` may still be NULL and + * row_count is 0. In that case, always realloc just in case. + */ + if (data_array_allocated && data_allocated_rows != row_count) { + size_t size = row_count * row_size; + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), size ? 
size : 1, + PyArray_HANDLER(data_array)); + if (new_data == NULL) { + Py_DECREF(data_array); + PyErr_NoMemory(); + return NULL; + } + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count; + } + + return data_array; + + error: + PyMem_FREE(conv_funcs); + tokenizer_clear(&ts); + Py_XDECREF(data_array); + return NULL; +} diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h new file mode 100644 index 000000000..20eb9e186 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -0,0 +1,22 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <stdio.h> + +#include "textreading/stream.h" +#include "textreading/field_types.h" +#include "textreading/parser_config.h" + + +NPY_NO_EXPORT PyArrayObject * +read_rows(stream *s, + npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c new file mode 100644 index 000000000..11b03e31c --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -0,0 +1,67 @@ + +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + +#include <string.h> +#include "textreading/str_to_int.h" +#include "textreading/parser_config.h" + + +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX, byteswap_unaligned) \ + NPY_NO_EXPORT int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + int64_t parsed; \ + intw##_t x; \ + \ + if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ + return -1; \ + } \ + else { \ + x = (intw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + byteswap_unaligned(dataptr); \ + } \ + return 0; \ + } + +#define DECLARE_TO_UINT(uintw, UINT_MAX, byteswap_unaligned) \ + NPY_NO_EXPORT int \ + to_##uintw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + uint64_t parsed; \ + uintw##_t x; \ + \ + if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \ + return -1; \ + } \ + else { \ + x = (uintw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + byteswap_unaligned(dataptr); \ + } \ + return 0; \ + } + +#define byteswap_nothing(ptr) + +DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX, byteswap_nothing) +DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX, npy_bswap8_unaligned) + +DECLARE_TO_UINT(uint8, UINT8_MAX, byteswap_nothing) +DECLARE_TO_UINT(uint16, UINT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_UINT(uint32, UINT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_UINT(uint64, UINT64_MAX, npy_bswap8_unaligned) diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h new file mode 100644 index 000000000..a0a89a0ef --- /dev/null +++ 
b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -0,0 +1,174 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + + +/* + * The following two string conversion functions are largely equivalent + * in Pandas. They are in the header file here, to ensure they can be easily + * inline in the other function. + * Unlike pandas, pass in end-pointer (do not rely on \0) and return 0 or -1. + * + * The actual functions are defined using macro templating below. + */ +NPY_FINLINE int +str_to_int64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + int64_t int_min, int64_t int_max, int64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + bool isneg = 0; + int64_t number = 0; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = true; + ++p; + } + else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? + if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +NPY_FINLINE int +str_to_uint64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + uint64_t uint_max, uint64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + return -1; + } + if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + + // Process the digits. + d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? 
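+    // (Illustration: "  +42  " parses as 42, while "42x" or "4 2" leave
+    // unconsumed characters behind and are rejected by the check below.)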
+ if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +#define DECLARE_TO_INT_PROTOTYPE(intw) \ + NPY_NO_EXPORT int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig); + +DECLARE_TO_INT_PROTOTYPE(int8) +DECLARE_TO_INT_PROTOTYPE(int16) +DECLARE_TO_INT_PROTOTYPE(int32) +DECLARE_TO_INT_PROTOTYPE(int64) + +DECLARE_TO_INT_PROTOTYPE(uint8) +DECLARE_TO_INT_PROTOTYPE(uint16) +DECLARE_TO_INT_PROTOTYPE(uint32) +DECLARE_TO_INT_PROTOTYPE(uint64) + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h new file mode 100644 index 000000000..59bd14074 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -0,0 +1,41 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ + +#include <stdint.h> + +/* + * When getting the next line, we hope that the buffer provider can already + * give some information about the newlines, because for Python iterables + * we definitely expect to get line-by-line buffers. + * + * BUFFER_IS_FILEEND must be returned when the end of the file is reached and + * must NOT be returned together with a valid (non-empty) buffer. + */ +#define BUFFER_MAY_CONTAIN_NEWLINE 0 +#define BUFFER_IS_LINEND 1 +#define BUFFER_IS_FILEEND 2 + +/* + * Base struct for streams. We currently have two, a chunked reader for + * filelikes and a line-by-line for any iterable. + * As of writing, the chunked reader was only used for filelikes not already + * opened. That is to preserve the amount read in case of an error exactly. + * If we drop this, we could read it more often (but not when `max_rows` is + * used). + * + * The "streams" can extend this struct to store their own data (so it is + * a very lightweight "object"). + */ +typedef struct _stream { + int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind); + // Note that the first argument to stream_close is the stream pointer + // itself, not the stream_data pointer. + int (*stream_close)(struct _stream *strm); +} stream; + + +#define stream_nextbuf(s, start, end, kind) \ + ((s)->stream_nextbuf((s), start, end, kind)) +#define stream_close(s) ((s)->stream_close((s))) + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c new file mode 100644 index 000000000..6f84ff01d --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -0,0 +1,239 @@ +/* + * C side structures to provide capabilities to read Python file like objects + * in chunks, or iterate through iterables with each result representing a + * single line of a file. + */ + +#include <stdio.h> +#include <stdlib.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/stream.h" + +#define READ_CHUNKSIZE 1 << 14 + + +typedef struct { + stream stream; + /* The Python file object being read. */ + PyObject *file; + + /* The `read` attribute of the file object. */ + PyObject *read; + /* Amount to read each time we call `obj.read()` */ + PyObject *chunksize; + + /* Python str object holding the line most recently read from the file. 
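+       (Despite the wording above, for this chunked file reader the member
+       below holds the most recent chunk of up to READ_CHUNKSIZE characters,
+       not necessarily a single line.)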
*/ + PyObject *chunk; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_chunks_from_file; + + +/* + * Helper function to support byte objects as well as unicode strings. + * + * NOTE: Steals a reference to `str` (although usually returns it unmodified). + */ +static NPY_INLINE PyObject * +process_stringlike(PyObject *str, const char *encoding) +{ + if (PyBytes_Check(str)) { + PyObject *ustr; + ustr = PyUnicode_FromEncodedObject(str, encoding, NULL); + if (ustr == NULL) { + return NULL; + } + Py_DECREF(str); + return ustr; + } + else if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, + "non-string returned while reading data"); + Py_DECREF(str); + return NULL; + } + return str; +} + + +static NPY_INLINE void +buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind) +{ + Py_ssize_t length = PyUnicode_GET_LENGTH(str); + *kind = PyUnicode_KIND(str); + + if (*kind == PyUnicode_1BYTE_KIND) { + *start = (char *)PyUnicode_1BYTE_DATA(str); + } + else if (*kind == PyUnicode_2BYTE_KIND) { + *start = (char *)PyUnicode_2BYTE_DATA(str); + length *= sizeof(Py_UCS2); + } + else if (*kind == PyUnicode_4BYTE_KIND) { + *start = (char *)PyUnicode_4BYTE_DATA(str); + length *= sizeof(Py_UCS4); + } + *end = *start + length; +} + + +static int +fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) +{ + Py_XDECREF(fb->chunk); + fb->chunk = NULL; + + PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL); + if (chunk == NULL) { + return -1; + } + fb->chunk = process_stringlike(chunk, fb->encoding); + if (fb->chunk == NULL) { + return -1; + } + buffer_info_from_unicode(fb->chunk, start, end, kind); + if (*start == *end) { + return BUFFER_IS_FILEEND; + } + return BUFFER_MAY_CONTAIN_NEWLINE; +} + + +static int +fb_del(stream *strm) +{ + python_chunks_from_file *fb = (python_chunks_from_file *)strm; + + Py_XDECREF(fb->file); + Py_XDECREF(fb->read); + Py_XDECREF(fb->chunksize); + Py_XDECREF(fb->chunk); + + PyMem_FREE(strm); + + return 0; +} + + +NPY_NO_EXPORT stream * +stream_python_file(PyObject *obj, const char *encoding) +{ + python_chunks_from_file *fb; + + fb = (python_chunks_from_file *)PyMem_Calloc(1, sizeof(python_chunks_from_file)); + if (fb == NULL) { + PyErr_NoMemory(); + return NULL; + } + + fb->stream.stream_nextbuf = (void *)&fb_nextbuf; + fb->stream.stream_close = &fb_del; + + fb->encoding = encoding; + Py_INCREF(obj); + fb->file = obj; + + fb->read = PyObject_GetAttrString(obj, "read"); + if (fb->read == NULL) { + goto fail; + } + fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE); + if (fb->chunksize == NULL) { + goto fail; + } + + return (stream *)fb; + +fail: + fb_del((stream *)fb); + return NULL; +} + + +/* + * Stream from a Python iterable by interpreting each item as a line in a file + */ +typedef struct { + stream stream; + /* The Python file object being read. 
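+       (More precisely, for this stream flavour: the Python iterator being
+       consumed, not an open file object.)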
*/ + PyObject *iterator; + + /* Python str object holding the line most recently fetched */ + PyObject *line; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_lines_from_iterator; + + +static int +it_del(stream *strm) +{ + python_lines_from_iterator *it = (python_lines_from_iterator *)strm; + + Py_XDECREF(it->iterator); + Py_XDECREF(it->line); + + PyMem_FREE(strm); + return 0; +} + + +static int +it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind) +{ + Py_XDECREF(it->line); + it->line = NULL; + + PyObject *line = PyIter_Next(it->iterator); + if (line == NULL) { + if (PyErr_Occurred()) { + return -1; + } + *start = NULL; + *end = NULL; + return BUFFER_IS_FILEEND; + } + it->line = process_stringlike(line, it->encoding); + if (it->line == NULL) { + return -1; + } + + buffer_info_from_unicode(it->line, start, end, kind); + return BUFFER_IS_LINEND; +} + + +NPY_NO_EXPORT stream * +stream_python_iterable(PyObject *obj, const char *encoding) +{ + python_lines_from_iterator *it; + + if (!PyIter_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "error reading from object, expected an iterable."); + return NULL; + } + + it = (python_lines_from_iterator *)PyMem_Calloc(1, sizeof(*it)); + if (it == NULL) { + PyErr_NoMemory(); + return NULL; + } + + it->stream.stream_nextbuf = (void *)&it_nextbuf; + it->stream.stream_close = &it_del; + + it->encoding = encoding; + Py_INCREF(obj); + it->iterator = obj; + + return (stream *)it; +} diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h new file mode 100644 index 000000000..45c11dd95 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h @@ -0,0 +1,16 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "textreading/stream.h" + +NPY_NO_EXPORT stream * +stream_python_file(PyObject *obj, const char *encoding); + +NPY_NO_EXPORT stream * +stream_python_iterable(PyObject *obj, const char *encoding); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src new file mode 100644 index 000000000..6ddba3345 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -0,0 +1,457 @@ + +#include <Python.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/stream.h" +#include "textreading/tokenize.h" +#include "textreading/parser_config.h" +#include "textreading/growth.h" + + +/* + How parsing quoted fields works: + + For quoting to be activated, the first character of the field + must be the quote character (after taking into account + ignore_leading_spaces). While quoting is active, delimiters + are treated as regular characters, not delimiters. Quoting is + deactivated by the second occurrence of the quote character. An + exception is the occurrence of two consecutive quote characters, + which is treated as a literal occurrence of a single quote character. + E.g. (with delimiter=',' and quote='"'): + 12.3,"New York, NY","3'2""" + The second and third fields are `New York, NY` and `3'2"`. 
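+    Another worked example (added for illustration): with the same settings,
+        1,"",3
+    has a quoted-but-empty second field, while
+        1,,3
+    has an unquoted empty one. Both parse as an empty string, but only the
+    first sets the field's `quoted` flag.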
+ + If a non-delimiter occurs after the closing quote, the quote is + ignored and parsing continues with quoting deactivated. Quotes + that occur while quoting is not activated are not handled specially; + they become part of the data. + E.g: + 12.3,"ABC"DEF,XY"Z + The second and third fields are `ABCDEF` and `XY"Z`. + + Note that the second field of + 12.3,"ABC" ,4.5 + is `ABC `. Currently there is no option to ignore whitespace + at the end of a field. +*/ + + +/**begin repeat + * #type = Py_UCS1, Py_UCS2, Py_UCS4# + */ +static NPY_INLINE int +copy_to_field_buffer_@type@(tokenizer_state *ts, + const @type@ *chunk_start, const @type@ *chunk_end) +{ + npy_intp chunk_length = chunk_end - chunk_start; + npy_intp size = chunk_length + ts->field_buffer_pos + 2; + + if (NPY_UNLIKELY(ts->field_buffer_length < size)) { + npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4)); + if (alloc_size < 0) { + PyErr_Format(PyExc_ValueError, + "line too long to handle while reading file."); + return -1; + } + Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size); + if (grown == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->field_buffer_length = size; + ts->field_buffer = grown; + } + + Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos; + for (; chunk_start < chunk_end; chunk_start++, write_pos++) { + *write_pos = (Py_UCS4)*chunk_start; + } + *write_pos = '\0'; /* always ensure we end with NUL */ + ts->field_buffer_pos += chunk_length; + return 0; +} +/**end repeat**/ + + +static NPY_INLINE int +add_field(tokenizer_state *ts) +{ + /* The previous field is done, advance to keep a NUL byte at the end */ + ts->field_buffer_pos += 1; + + if (NPY_UNLIKELY(ts->num_fields + 1 > ts->fields_size)) { + npy_intp size = ts->num_fields; + + npy_intp alloc_size = grow_size_and_multiply( + &size, 4, sizeof(field_info)); + if (alloc_size < 0) { + /* Check for a size overflow, path should be almost impossible. */ + PyErr_Format(PyExc_ValueError, + "too many columns found; cannot read file."); + return -1; + } + field_info *fields = PyMem_Realloc(ts->fields, alloc_size); + if (fields == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->fields = fields; + ts->fields_size = size; + } + + ts->fields[ts->num_fields].offset = ts->field_buffer_pos; + ts->fields[ts->num_fields].quoted = false; + ts->num_fields += 1; + /* Ensure this (currently empty) word is NUL terminated. 
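+       (Safe because `field_buffer` always reserves room for two trailing
+       NULs; see the `field_buffer` comment in tokenize.h.)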
*/
+    ts->field_buffer[ts->field_buffer_pos] = '\0';
+    return 0;
+}
+
+
+/**begin repeat
+ * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+{
+    @type@ *pos = (@type@ *)ts->pos;
+    @type@ *stop = (@type@ *)ts->end;
+    @type@ *chunk_start;
+
+    if (ts->state == TOKENIZE_CHECK_QUOTED) {
+        /* before we can check for quotes, strip leading whitespace */
+        if (config->ignore_leading_whitespace) {
+            while (pos < stop && Py_UNICODE_ISSPACE(*pos) &&
+                        *pos != '\r' && *pos != '\n') {
+                pos++;
+            }
+            if (pos == stop) {
+                ts->pos = (char *)pos;
+                return 0;
+            }
+        }
+
+        /* Setting chunk effectively starts the field */
+        if (*pos == config->quote) {
+            ts->fields[ts->num_fields - 1].quoted = true;
+            ts->state = TOKENIZE_QUOTED;
+            pos++;  /* TOKENIZE_QUOTED is OK with pos == stop */
+        }
+        else {
+            /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+            ts->state = ts->unquoted_state;
+        }
+    }
+
+    switch (ts->state) {
+        case TOKENIZE_UNQUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (*pos == config->delimiter) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_UNQUOTED_WHITESPACE:
+            /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (Py_UNICODE_ISSPACE(*pos)) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == config->quote) {
+                    ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
+            if (*pos == config->quote) {
+                /* Copy the quote character directly from the config: */
+                if (copy_to_field_buffer_Py_UCS4(ts,
+                        &config->quote, &config->quote+1) < 0) {
+                    return -1;
+                }
+                ts->state = TOKENIZE_QUOTED;
+                pos++;
+            }
+            else {
+                /* continue parsing as if unquoted */
+                ts->state = TOKENIZE_UNQUOTED;
+            }
+            break;
+
+        case TOKENIZE_GOTO_LINE_END:
+            if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
+                pos = stop;  /* advance to next buffer */
+                ts->state = TOKENIZE_LINE_END;
+                break;
+            }
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_EAT_CRLF:
+            /* "Universal newline" support: remove \n in \r\n. */
+            if (*pos == '\n') {
+                pos++;
+            }
+            ts->state = TOKENIZE_LINE_END;
+            break;
+
+        default:
+            assert(0);
+    }
+
+    ts->pos = (char *)pos;
+    return 0;
+}
+/**end repeat**/
+
+
+/*
+ * This tokenizer always copies the full "row" (all tokens). This makes
+ * two things easier:
+ * 1.
It means that every word is guaranteed to be followed by a NUL character + * (although it can include one as well). + * 2. If usecols are used we can sniff the first row easier by parsing it + * fully. Further, usecols can be negative so we may not know which row we + * need up-front. + * + * The tokenizer could grow the ability to skip fields and check the + * maximum number of fields when known, it is unclear that this is worthwhile. + * + * Unlike some tokenizers, this one tries to work in chunks and copies + * data in chunks as well. The hope is that this makes multiple light-weight + * loops rather than a single heavy one, to allow e.g. quickly scanning for the + * end of a field. Copying chunks also means we usually only check once per + * field whether the buffer is large enough. + * Different choices are possible, this one seems to work well, though. + * + * The core (main part) of the tokenizer is specialized for the three Python + * unicode flavors UCS1, UCS2, and UCS4 as a worthwhile optimization. + */ +NPY_NO_EXPORT int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config) +{ + assert(ts->fields_size >= 2); + assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4)); + + int finished_reading_file = 0; + + /* Reset to start of buffer */ + ts->field_buffer_pos = 0; + ts->num_fields = 0; + + while (1) { + /* + * This loop adds new fields to the result (to make up a full row) + * until the row ends (typically a line end or the file end) + */ + if (ts->state == TOKENIZE_INIT) { + /* Start a new field */ + if (add_field(ts) < 0) { + return -1; + } + ts->state = TOKENIZE_CHECK_QUOTED; + } + + if (NPY_UNLIKELY(ts->pos >= ts->end)) { + if (ts->buf_state == BUFFER_IS_LINEND && + ts->state != TOKENIZE_QUOTED) { + /* + * Finished line, do not read anymore (also do not eat \n). + * If we are in a quoted field and the "line" does not end with + * a newline, the quoted field will not have it either. + * I.e. `np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"')` + * reads "ab". This matches `next(csv.reader(['"a', 'b"']))`. + */ + break; + } + /* fetch new data */ + ts->buf_state = stream_nextbuf(s, + &ts->pos, &ts->end, &ts->unicode_kind); + if (ts->buf_state < 0) { + return -1; + } + if (ts->buf_state == BUFFER_IS_FILEEND) { + finished_reading_file = 1; + ts->pos = ts->end; /* stream should ensure this. */ + break; + } + else if (ts->pos == ts->end) { + /* This must be an empty line (and it must be indicated!). */ + assert(ts->buf_state == BUFFER_IS_LINEND); + break; + } + } + int status; + if (ts->unicode_kind == PyUnicode_1BYTE_KIND) { + status = tokenizer_core_Py_UCS1(ts, config); + } + else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) { + status = tokenizer_core_Py_UCS2(ts, config); + } + else { + assert(ts->unicode_kind == PyUnicode_4BYTE_KIND); + status = tokenizer_core_Py_UCS4(ts, config); + } + if (status < 0) { + return -1; + } + + if (ts->state == TOKENIZE_LINE_END) { + break; + } + } + + /* + * We have finished tokenizing a full row into fields, finalize result + */ + if (ts->buf_state == BUFFER_IS_LINEND) { + /* This line is "finished", make sure we don't touch it again: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + if (NPY_UNLIKELY(ts->pos < ts->end)) { + PyErr_SetString(PyExc_ValueError, + "Found an unquoted embedded newline within a single line of " + "input. 
This is currently not supported."); + return -1; + } + } + + /* Finish the last field (we "append" one to store the last ones length) */ + if (add_field(ts) < 0) { + return -1; + } + ts->num_fields -= 1; + + /* + * If have one field, but that field is completely empty, this is an + * empty line, and we just ignore it. + */ + if (ts->num_fields == 1 + && ts->fields[1].offset - ts->fields[0].offset == 1 + && !ts->fields->quoted) { + ts->num_fields--; + } + ts->state = TOKENIZE_INIT; + return finished_reading_file; +} + + +NPY_NO_EXPORT void +tokenizer_clear(tokenizer_state *ts) +{ + PyMem_FREE(ts->field_buffer); + ts->field_buffer = NULL; + ts->field_buffer_length = 0; + + PyMem_FREE(ts->fields); + ts->fields = NULL; + ts->fields_size = 0; +} + + +/* + * Initialize the tokenizer. We may want to copy all important config + * variables into the tokenizer. This would improve the cache locality during + * tokenizing. + */ +NPY_NO_EXPORT int +tokenizer_init(tokenizer_state *ts, parser_config *config) +{ + /* State and buf_state could be moved into tokenize if we go by row */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + ts->state = TOKENIZE_INIT; + if (config->delimiter_is_whitespace) { + ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE; + } + else { + ts->unquoted_state = TOKENIZE_UNQUOTED; + } + ts->num_fields = 0; + + ts->buf_state = 0; + ts->pos = NULL; + ts->end = NULL; + + ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4)); + if (ts->field_buffer == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->field_buffer_length = 32; + + ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields)); + if (ts->fields == NULL) { + PyErr_NoMemory(); + return -1; + } + ts->fields_size = 4; + return 0; +} diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h new file mode 100644 index 000000000..fa10bb9b0 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -0,0 +1,78 @@ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ + +#include <Python.h> +#include "numpy/ndarraytypes.h" + +#include "textreading/stream.h" +#include "textreading/parser_config.h" + + +typedef enum { + /* Initialization of fields */ + TOKENIZE_INIT, + TOKENIZE_CHECK_QUOTED, + /* Main field parsing states */ + TOKENIZE_UNQUOTED, + TOKENIZE_UNQUOTED_WHITESPACE, + TOKENIZE_QUOTED, + /* Handling of two character control sequences (except "\r\n") */ + TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE, + /* Line end handling */ + TOKENIZE_LINE_END, + TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */ + TOKENIZE_GOTO_LINE_END, +} tokenizer_parsing_state; + + +typedef struct { + size_t offset; + bool quoted; +} field_info; + + +typedef struct { + tokenizer_parsing_state state; + /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */ + tokenizer_parsing_state unquoted_state; + int unicode_kind; + int buf_state; + /* the buffer we are currently working on */ + char *pos; + char *end; + /* + * Space to copy words into. The buffer must always be at least two NUL + * entries longer (8 bytes) than the actual word (including initially). + * The first byte beyond the current word is always NUL'ed on write, the + * second byte is there to allow easy appending of an additional empty + * word at the end (this word is also NUL terminated). + */ + npy_intp field_buffer_length; + npy_intp field_buffer_pos; + Py_UCS4 *field_buffer; + + /* + * Fields, including information about the field being quoted. 
This + * always includes one "additional" empty field. The length of a field + * is equal to `fields[i+1].offset - fields[i].offset - 1`. + * + * The tokenizer assumes at least one field is allocated. + */ + npy_intp num_fields; + npy_intp fields_size; + field_info *fields; +} tokenizer_state; + + +NPY_NO_EXPORT void +tokenizer_clear(tokenizer_state *ts); + + +NPY_NO_EXPORT int +tokenizer_init(tokenizer_state *ts, parser_config *config); + +NPY_NO_EXPORT int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */ diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src index 933f75808..b4b060720 100644 --- a/numpy/core/src/npysort/quicksort.c.src +++ b/numpy/core/src/npysort/quicksort.c.src @@ -51,8 +51,14 @@ #include "npy_sort.h" #include "npysort_common.h" +#include "npy_cpu_features.h" +#include "x86-qsort.h" #include <stdlib.h> +#ifndef NPY_DISABLE_OPTIMIZATION + #include "x86-qsort.dispatch.h" +#endif + #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -83,11 +89,22 @@ * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_ushort, npy_float, npy_double, npy_longdouble, npy_cfloat, * npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta# + * #AVX512 = 0*5, 1, 1, 0*5, 1, 0*7# */ NPY_NO_EXPORT int quicksort_@suff@(void *start, npy_intp num, void *NOT_USED) { + +#if @AVX512@ + void (*dispfunc)(void*, npy_intp) = NULL; + NPY_CPU_DISPATCH_CALL_XB(dispfunc = &x86_quicksort_@suff@); + if (dispfunc) { + (*dispfunc)(start, num); + return 0; + } +#endif + @type@ vp; @type@ *pl = start; @type@ *pr = pl + num - 1; diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.c.src b/numpy/core/src/npysort/x86-qsort.dispatch.c.src new file mode 100644 index 000000000..b93c737cb --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort.dispatch.c.src @@ -0,0 +1,587 @@ +/*@targets + * $maxopt $keep_baseline avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. + +#include "x86-qsort.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#ifdef NPY_HAVE_AVX512_SKX +#include <immintrin.h> +#include "numpy/npy_math.h" +#include "npy_sort.h" +#include "simd/simd.h" + + +/* + * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are + * based on these two research papers: + * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types + * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ + * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel Skylake + * https://arxiv.org/pdf/1704.08579.pdf + * + * High level idea: Vectorize the quicksort partitioning using AVX-512 + * compressstore instructions. The algorithm to pick the pivot is to use median of + * 72 elements picked at random. If the array size is < 128, then use + * Bitonic sorting network. Good resource for bitonic sorting network: + * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 + * + * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 for + * potential problems when converting this code to universal intrinsics framework. + */ + +/* + * Constants used in sorting 16 elements in a ZMM registers. 
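+ * (The NETWORK1..NETWORK7 macros defined below are the 16-lane index
+ * patterns fed to the permute instructions in the individual stages of
+ * the sorting network.)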
Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +#define NETWORK1 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 +#define NETWORK2 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 +#define NETWORK3 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 +#define NETWORK4 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2 +#define NETWORK5 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +#define NETWORK6 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4 +#define NETWORK7 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8 +#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) +#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) +#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) +#define SHUFFLE_MASK(a,b,c,d) (a << 6) | (b << 4) | (c << 2) | d +#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) +#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +/* + * Vectorized random number generator xoroshiro128+. Broken into 2 parts: + * (1) vnext generates 2 64-bit random integers + * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to + * the length of the array + */ +#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ + _mm256_or_si256(_mm256_slli_epi64((x),(k)),_mm256_srli_epi64((x),64-(k))) + +static NPY_INLINE +__m256i vnext(__m256i* s0, __m256i* s1) { + *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ + *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), + _mm256_slli_epi64(*s1, 16)); + *s1 = VROTL(*s1, 37); + return _mm256_add_epi64(*s0, *s1); /* return random vector */ +} + +/* transform random numbers to the range between 0 and bound - 1 */ +static NPY_INLINE +__m256i rnd_epu32(__m256i rnd_vec, __m256i bound) { + __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); + __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); + return _mm256_blend_epi32(odd, even, 0b01010101); +} + +/**begin repeat + * + * #TYPE = INT, UINT, FLOAT# + * #type = int, uint, float# + * #type_t = npy_int, npy_uint, npy_float# + * #zmm_t = __m512i, __m512i, __m512# + * #ymm_t = __m256i, __m256i, __m256# + * #vsuf1 = epi32, epu32, ps# + * #vsuf2 = epi32, epi32, ps# + * #vsuf3 = si512, si512, ps# + * #vsuf4 = s32, u32, f32# + * #CMP_GE_OP = _MM_CMPINT_NLT, _MM_CMPINT_NLT, _CMP_GE_OQ# + * #TYPE_MAX_VAL = NPY_MAX_INT32, NPY_MAX_UINT32, NPY_INFINITYF# + * #TYPE_MIN_VAL = NPY_MIN_INT32, 0, -NPY_INFINITYF# + */ + +/* + * COEX == Compare and Exchange two registers by swapping min and max values + */ +#define COEX_ZMM_@vsuf1@(a, b) { \ + @zmm_t@ temp = a; \ + a = _mm512_min_@vsuf1@(a,b); \ + b = _mm512_max_@vsuf1@(temp, b);} \ + +#define COEX_YMM_@vsuf1@(a, b){ \ + @ymm_t@ temp = a; \ + a = _mm256_min_@vsuf1@(a, b); \ + b = _mm256_max_@vsuf1@(temp, b);} \ + +static NPY_INLINE +@zmm_t@ cmp_merge_@vsuf1@(@zmm_t@ in1, @zmm_t@ in2, __mmask16 mask) +{ + @zmm_t@ min = _mm512_min_@vsuf1@(in2, in1); + @zmm_t@ max = _mm512_max_@vsuf1@(in2, in1); + return _mm512_mask_mov_@vsuf2@(min, mask, max); // 0 -> min, 1 -> max +} + +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +static NPY_INLINE +@zmm_t@ sort_zmm_@vsuf1@(@zmm_t@ zmm) +{ + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(0,1,2,3)), 0xCCCC); + zmm = 
cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK3),zmm), 0xF0F0); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5),zmm), 0xFF00); + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +static NPY_INLINE +@zmm_t@ bitonic_merge_zmm_@vsuf1@(@zmm_t@ zmm) +{ + // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK7),zmm), 0xFF00); + // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. + zmm = cmp_merge_@vsuf1@(zmm, _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK6),zmm), 0xF0F0); + // 3) half_cleaner[4] + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(1,0,3,2)), 0xCCCC); + // 3) half_cleaner[1] + zmm = cmp_merge_@vsuf1@(zmm, SHUFFLE_@vsuf2@(zmm, SHUFFLE_MASK(2,3,0,1)), 0xAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +static NPY_INLINE +void bitonic_merge_two_zmm_@vsuf1@(@zmm_t@* zmm1, @zmm_t@* zmm2) +{ + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + *zmm2 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), *zmm2); + @zmm_t@ zmm3 = _mm512_min_@vsuf1@(*zmm1, *zmm2); + @zmm_t@ zmm4 = _mm512_max_@vsuf1@(*zmm1, *zmm2); + // 2) Recursive half cleaner for each + *zmm1 = bitonic_merge_zmm_@vsuf1@(zmm3); + *zmm2 = bitonic_merge_zmm_@vsuf1@(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive half cleaner +static NPY_INLINE +void bitonic_merge_four_zmm_@vsuf1@(@zmm_t@* zmm) +{ + @zmm_t@ zmm2r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[2]); + @zmm_t@ zmm3r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[3]); + @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm3r); + @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm2r); + @zmm_t@ zmm_t3 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm2r)); + @zmm_t@ zmm_t4 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm3r)); + @zmm_t@ zmm0 = _mm512_min_@vsuf1@(zmm_t1, zmm_t2); + @zmm_t@ zmm1 = _mm512_max_@vsuf1@(zmm_t1, zmm_t2); + @zmm_t@ zmm2 = _mm512_min_@vsuf1@(zmm_t3, zmm_t4); + @zmm_t@ zmm3 = _mm512_max_@vsuf1@(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm0); + zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm1); + zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm2); + zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm3); +} + +static NPY_INLINE +void bitonic_merge_eight_zmm_@vsuf1@(@zmm_t@* zmm) +{ + @zmm_t@ zmm4r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[4]); + @zmm_t@ zmm5r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[5]); + @zmm_t@ zmm6r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[6]); + @zmm_t@ zmm7r = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), zmm[7]); + @zmm_t@ zmm_t1 = _mm512_min_@vsuf1@(zmm[0], zmm7r); + @zmm_t@ zmm_t2 = _mm512_min_@vsuf1@(zmm[1], zmm6r); + @zmm_t@ zmm_t3 = 
_mm512_min_@vsuf1@(zmm[2], zmm5r); + @zmm_t@ zmm_t4 = _mm512_min_@vsuf1@(zmm[3], zmm4r); + @zmm_t@ zmm_t5 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[3], zmm4r)); + @zmm_t@ zmm_t6 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[2], zmm5r)); + @zmm_t@ zmm_t7 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[1], zmm6r)); + @zmm_t@ zmm_t8 = _mm512_permutexvar_@vsuf2@(_mm512_set_epi32(NETWORK5), _mm512_max_@vsuf1@(zmm[0], zmm7r)); + COEX_ZMM_@vsuf1@(zmm_t1, zmm_t3); + COEX_ZMM_@vsuf1@(zmm_t2, zmm_t4); + COEX_ZMM_@vsuf1@(zmm_t5, zmm_t7); + COEX_ZMM_@vsuf1@(zmm_t6, zmm_t8); + COEX_ZMM_@vsuf1@(zmm_t1, zmm_t2); + COEX_ZMM_@vsuf1@(zmm_t3, zmm_t4); + COEX_ZMM_@vsuf1@(zmm_t5, zmm_t6); + COEX_ZMM_@vsuf1@(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_@vsuf1@(zmm_t1); + zmm[1] = bitonic_merge_zmm_@vsuf1@(zmm_t2); + zmm[2] = bitonic_merge_zmm_@vsuf1@(zmm_t3); + zmm[3] = bitonic_merge_zmm_@vsuf1@(zmm_t4); + zmm[4] = bitonic_merge_zmm_@vsuf1@(zmm_t5); + zmm[5] = bitonic_merge_zmm_@vsuf1@(zmm_t6); + zmm[6] = bitonic_merge_zmm_@vsuf1@(zmm_t7); + zmm[7] = bitonic_merge_zmm_@vsuf1@(zmm_t8); +} + +static NPY_INLINE +void sort_16_@vsuf1@(@type_t@* arr, npy_int N) +{ + __mmask16 load_mask = (0x0001 << N) - 0x0001; + @zmm_t@ zmm = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr); + _mm512_mask_storeu_@vsuf2@(arr, load_mask, sort_zmm_@vsuf1@(zmm)); +} + +static NPY_INLINE +void sort_32_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 16) { + sort_16_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm1 = _mm512_loadu_@vsuf3@(arr); + __mmask16 load_mask = (0x0001 << (N-16)) - 0x0001; + @zmm_t@ zmm2 = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask, arr + 16); + zmm1 = sort_zmm_@vsuf1@(zmm1); + zmm2 = sort_zmm_@vsuf1@(zmm2); + bitonic_merge_two_zmm_@vsuf1@(&zmm1, &zmm2); + _mm512_storeu_@vsuf3@(arr, zmm1); + _mm512_mask_storeu_@vsuf2@(arr + 16, load_mask, zmm2); +} + +static NPY_INLINE +void sort_64_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 32) { + sort_32_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm[4]; + zmm[0] = _mm512_loadu_@vsuf3@(arr); + zmm[1] = _mm512_loadu_@vsuf3@(arr + 16); + __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + if (N < 48) { + load_mask1 = (0x0001 << (N-32)) - 0x0001; + load_mask2 = 0x0000; + } + else if (N < 64) { + load_mask2 = (0x0001 << (N-48)) - 0x0001; + } + zmm[2] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 32); + zmm[3] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 48); + zmm[0] = sort_zmm_@vsuf1@(zmm[0]); + zmm[1] = sort_zmm_@vsuf1@(zmm[1]); + zmm[2] = sort_zmm_@vsuf1@(zmm[2]); + zmm[3] = sort_zmm_@vsuf1@(zmm[3]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]); + bitonic_merge_four_zmm_@vsuf1@(zmm); + _mm512_storeu_@vsuf3@(arr, zmm[0]); + _mm512_storeu_@vsuf3@(arr + 16, zmm[1]); + _mm512_mask_storeu_@vsuf2@(arr + 32, load_mask1, zmm[2]); + _mm512_mask_storeu_@vsuf2@(arr + 48, load_mask2, zmm[3]); +} + +static NPY_INLINE +void sort_128_@vsuf1@(@type_t@* arr, npy_int N) +{ + if (N <= 64) { + sort_64_@vsuf1@(arr, N); + return; + } + @zmm_t@ zmm[8]; + zmm[0] = _mm512_loadu_@vsuf3@(arr); + zmm[1] = _mm512_loadu_@vsuf3@(arr + 16); + zmm[2] = _mm512_loadu_@vsuf3@(arr + 32); + zmm[3] = _mm512_loadu_@vsuf3@(arr + 48); + zmm[0] = sort_zmm_@vsuf1@(zmm[0]); + zmm[1] = sort_zmm_@vsuf1@(zmm[1]); + zmm[2] = sort_zmm_@vsuf1@(zmm[2]); + zmm[3] = sort_zmm_@vsuf1@(zmm[3]); + __mmask16 load_mask1 = 0xFFFF, 
load_mask2 = 0xFFFF; + __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; + if (N < 80) { + load_mask1 = (0x0001 << (N-64)) - 0x0001; + load_mask2 = 0x0000; + load_mask3 = 0x0000; + load_mask4 = 0x0000; + } + else if (N < 96) { + load_mask2 = (0x0001 << (N-80)) - 0x0001; + load_mask3 = 0x0000; + load_mask4 = 0x0000; + } + else if (N < 112) { + load_mask3 = (0x0001 << (N-96)) - 0x0001; + load_mask4 = 0x0000; + } + else { + load_mask4 = (0x0001 << (N-112)) - 0x0001; + } + zmm[4] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask1, arr + 64); + zmm[5] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask2, arr + 80); + zmm[6] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask3, arr + 96); + zmm[7] = _mm512_mask_loadu_@vsuf2@(ZMM_MAX_@TYPE@, load_mask4, arr + 112); + zmm[4] = sort_zmm_@vsuf1@(zmm[4]); + zmm[5] = sort_zmm_@vsuf1@(zmm[5]); + zmm[6] = sort_zmm_@vsuf1@(zmm[6]); + zmm[7] = sort_zmm_@vsuf1@(zmm[7]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[2], &zmm[3]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[4], &zmm[5]); + bitonic_merge_two_zmm_@vsuf1@(&zmm[6], &zmm[7]); + bitonic_merge_four_zmm_@vsuf1@(zmm); + bitonic_merge_four_zmm_@vsuf1@(zmm + 4); + bitonic_merge_eight_zmm_@vsuf1@(zmm); + _mm512_storeu_@vsuf3@(arr, zmm[0]); + _mm512_storeu_@vsuf3@(arr + 16, zmm[1]); + _mm512_storeu_@vsuf3@(arr + 32, zmm[2]); + _mm512_storeu_@vsuf3@(arr + 48, zmm[3]); + _mm512_mask_storeu_@vsuf2@(arr + 64, load_mask1, zmm[4]); + _mm512_mask_storeu_@vsuf2@(arr + 80, load_mask2, zmm[5]); + _mm512_mask_storeu_@vsuf2@(arr + 96, load_mask3, zmm[6]); + _mm512_mask_storeu_@vsuf2@(arr + 112, load_mask4, zmm[7]); +} + + +static NPY_INLINE +void swap_@TYPE@(@type_t@ *arr, npy_intp ii, npy_intp jj) { + @type_t@ temp = arr[ii]; + arr[ii] = arr[jj]; + arr[jj] = temp; +} + +// Median of 3 stratergy +//static NPY_INLINE +//npy_intp get_pivot_index(@type_t@ *arr, const npy_intp left, const npy_intp right) { +// return (rand() % (right + 1 - left)) + left; +// //npy_intp middle = ((right-left)/2) + left; +// //@type_t@ a = arr[left], b = arr[middle], c = arr[right]; +// //if ((b >= a && b <= c) || (b <= a && b >= c)) +// // return middle; +// //if ((a >= b && a <= c) || (a <= b && a >= c)) +// // return left; +// //else +// // return right; +//} + +/* + * Picking the pivot: Median of 72 array elements chosen at random. + */ + +static NPY_INLINE +@type_t@ get_pivot_@vsuf1@(@type_t@ *arr, const npy_intp left, const npy_intp right) { + /* seeds for vectorized random number generator */ + __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, + 1324281658759788278, 6214952190349879213); + __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, + 7874578921548791257, 1998265912745817298); + s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); + s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); + + npy_intp arrsize = right - left + 1; + __m256i bound = _mm256_set1_epi32(arrsize > INT32_MAX ? 
INT32_MAX : arrsize);
+    __m512i left_vec = _mm512_set1_epi64(left);
+    __m512i right_vec = _mm512_set1_epi64(right);
+    @ymm_t@ v[9];
+    /* fill 9 vectors with random numbers */
+    for (npy_int i = 0; i < 9; ++i) {
+        __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */
+        __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(rand_64, bound)); /* random numbers between 0 and bound - 1 */
+        __m512i indices;
+        if (i < 5)
+            indices = _mm512_add_epi64(left_vec, rand_32); /* indices for arr */
+        else
+            indices = _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */
+
+        v[i] = _mm512_i64gather_@vsuf2@(indices, arr, sizeof(@type_t@));
+    }
+
+    /* median network for 9 elements */
+    COEX_YMM_@vsuf1@(v[0], v[1]); COEX_YMM_@vsuf1@(v[2], v[3]);
+    COEX_YMM_@vsuf1@(v[4], v[5]); COEX_YMM_@vsuf1@(v[6], v[7]);
+    COEX_YMM_@vsuf1@(v[0], v[2]); COEX_YMM_@vsuf1@(v[1], v[3]);
+    COEX_YMM_@vsuf1@(v[4], v[6]); COEX_YMM_@vsuf1@(v[5], v[7]);
+    COEX_YMM_@vsuf1@(v[0], v[4]); COEX_YMM_@vsuf1@(v[1], v[2]);
+    COEX_YMM_@vsuf1@(v[5], v[6]); COEX_YMM_@vsuf1@(v[3], v[7]);
+    COEX_YMM_@vsuf1@(v[1], v[5]); COEX_YMM_@vsuf1@(v[2], v[6]);
+    COEX_YMM_@vsuf1@(v[3], v[5]); COEX_YMM_@vsuf1@(v[2], v[4]);
+    COEX_YMM_@vsuf1@(v[3], v[4]);
+    COEX_YMM_@vsuf1@(v[3], v[8]);
+    COEX_YMM_@vsuf1@(v[4], v[8]);
+
+    // technically v[4] needs to be sorted before we pick the correct median,
+    // picking the 4th element works just as well for performance
+    @type_t@* temp = (@type_t@*) &v[4];
+
+    return temp[4];
+}
+
+/*
+ * Partition one ZMM register based on the pivot and return the number of
+ * elements that went to the greater-than side.
+ */
+static NPY_INLINE
+npy_int partition_vec_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right,
+                              const @zmm_t@ curr_vec, const @zmm_t@ pivot_vec,
+                              @zmm_t@* smallest_vec, @zmm_t@* biggest_vec)
+{
+    /* which elements compare >= the pivot (they go to the right-hand side) */
+    __mmask16 gt_mask = _mm512_cmp_@vsuf1@_mask(curr_vec, pivot_vec, @CMP_GE_OP@);
+    npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask);
+    _mm512_mask_compressstoreu_@vsuf2@(arr + left, _knot_mask16(gt_mask), curr_vec);
+    _mm512_mask_compressstoreu_@vsuf2@(arr + right - amount_gt_pivot, gt_mask, curr_vec);
+    *smallest_vec = _mm512_min_@vsuf1@(curr_vec, *smallest_vec);
+    *biggest_vec = _mm512_max_@vsuf1@(curr_vec, *biggest_vec);
+    return amount_gt_pivot;
+}
+
+/*
+ * Partition an array based on the pivot and return the index of the first
+ * element that belongs to the greater-than part.
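+ * (For example, partitioning {5, 1, 7, 3} around pivot 4 leaves 3, 1 in
+ * the first two slots and 7, 5 in the last two, and returns index 2.)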
+ */ +static NPY_INLINE +npy_intp partition_avx512_@vsuf1@(@type_t@* arr, npy_intp left, npy_intp right, + @type_t@ pivot, @type_t@* smallest, @type_t@* biggest) +{ + /* make array length divisible by 16 , shortening the array */ + for (npy_int i = (right - left) % 16; i > 0; --i) { + *smallest = MIN(*smallest, arr[left]); + *biggest = MAX(*biggest, arr[left]); + if (arr[left] > pivot) { + swap_@TYPE@(arr, left, --right); + } + else { + ++left; + } + } + + if(left == right) + return left; /* less than 16 elements in the array */ + + @zmm_t@ pivot_vec = _mm512_set1_@vsuf2@(pivot); + @zmm_t@ min_vec = _mm512_set1_@vsuf2@(*smallest); + @zmm_t@ max_vec = _mm512_set1_@vsuf2@(*biggest); + + if(right - left == 16) { + @zmm_t@ vec = _mm512_loadu_@vsuf3@(arr + left); + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); + *smallest = npyv_reducemin_@vsuf4@(min_vec); + *biggest = npyv_reducemax_@vsuf4@(max_vec); + return left + (16 - amount_gt_pivot); + } + + // first and last 16 values are partitioned at the end + @zmm_t@ vec_left = _mm512_loadu_@vsuf3@(arr + left); + @zmm_t@ vec_right = _mm512_loadu_@vsuf3@(arr + (right - 16)); + // store points of the vectors + npy_intp r_store = right - 16; + npy_intp l_store = left; + // indices for loading the elements + left += 16; + right -= 16; + while(right - left != 0) { + @zmm_t@ curr_vec; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if((r_store + 16) - right < left - l_store) { + right -= 16; + curr_vec = _mm512_loadu_@vsuf3@(arr + right); + } + else { + curr_vec = _mm512_loadu_@vsuf3@(arr + left); + left += 16; + } + // partition the current vector and save it on both sides of the array + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, curr_vec, pivot_vec, &min_vec, &max_vec);; + r_store -= amount_gt_pivot; l_store += (16 - amount_gt_pivot); + } + + /* partition and save vec_left and vec_right */ + npy_int amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, r_store + 16, vec_left, pivot_vec, &min_vec, &max_vec); + l_store += (16 - amount_gt_pivot); + amount_gt_pivot = partition_vec_@vsuf1@(arr, l_store, l_store + 16, vec_right, pivot_vec, &min_vec, &max_vec); + l_store += (16 - amount_gt_pivot); + *smallest = npyv_reducemin_@vsuf4@(min_vec); + *biggest = npyv_reducemax_@vsuf4@(max_vec); + return l_store; +} + +static NPY_INLINE +void qsort_@type@(@type_t@* arr, npy_intp left, npy_intp right, npy_int max_iters) +{ + /* + * Resort to heapsort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + heapsort_@type@((void*)(arr + left), right + 1 - left, NULL); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_@vsuf1@(arr + left, right + 1 -left); + return; + } + + @type_t@ pivot = get_pivot_@vsuf1@(arr, left, right); + @type_t@ smallest = @TYPE_MAX_VAL@; + @type_t@ biggest = @TYPE_MIN_VAL@; + npy_intp pivot_index = partition_avx512_@vsuf1@(arr, left, right+1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_@type@(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_@type@(arr, pivot_index, right, max_iters - 1); +} +/**end repeat**/ + +static NPY_INLINE +npy_intp replace_nan_with_inf(npy_float* arr, npy_intp arrsize) +{ + npy_intp nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { + loadmask 
= (0x0001 << arrsize) - 0x0001; + } + __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); + __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((npy_int) nanmask); + _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); + arr += 16; + arrsize -= 16; + } + return nan_count; +} + +static NPY_INLINE +void replace_inf_with_nan(npy_float* arr, npy_intp arrsize, npy_intp nan_count) +{ + for (npy_intp ii = arrsize-1; nan_count > 0; --ii) { + arr[ii] = NPY_NANF; + nan_count -= 1; + } +} + +/**begin repeat + * + * #type = int, uint, float# + * #type_t = npy_int, npy_uint, npy_float# + * #FIXNAN = 0, 0, 1# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(x86_quicksort_@type@) +(void* arr, npy_intp arrsize) +{ + if (arrsize > 1) { +#if @FIXNAN@ + npy_intp nan_count = replace_nan_with_inf((@type_t@*) arr, arrsize); +#endif + qsort_@type@((@type_t@*) arr, 0, arrsize-1, 2*log2(arrsize)); +#if @FIXNAN@ + replace_inf_with_nan((@type_t@*) arr, arrsize, nan_count); +#endif + } +} +/**end repeat**/ + +#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h new file mode 100644 index 000000000..8cb8e3654 --- /dev/null +++ b/numpy/core/src/npysort/x86-qsort.h @@ -0,0 +1,18 @@ +#include "numpy/npy_common.h" +#include "npy_cpu_dispatch.h" + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "x86-qsort.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, + (void *start, npy_intp num)) + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, + (void *start, npy_intp num)) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 708e82910..73bb5e2d8 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -9278,3 +9278,52 @@ class TestViewDtype: [[1284, 1798], [4368, 4882]], [[2312, 2826], [5396, 5910]]] assert_array_equal(x.view('<i2'), expected) + + +# Test various array sizes that hit different code paths in quicksort-avx512 +@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191, + 256, 383, 512, 1023, 2047]) +def test_sort_float(N): + # Regular data with nan sprinkled + np.random.seed(42) + arr = -0.5 + np.random.sample(N).astype('f') + arr[np.random.choice(arr.shape[0], 3)] = np.nan + assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) + + # (2) with +INF + infarr = np.inf*np.ones(N, dtype='f') + infarr[np.random.choice(infarr.shape[0], 5)] = -1.0 + assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) + + # (3) with -INF + neginfarr = -np.inf*np.ones(N, dtype='f') + neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0 + assert_equal(np.sort(neginfarr, kind='quick'), + np.sort(neginfarr, kind='heap')) + + # (4) with +/-INF + infarr = np.inf*np.ones(N, dtype='f') + infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf + assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap')) + + +def test_sort_int(): + # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled + rng = np.random.default_rng(42) + N = 2047 + minv = np.iinfo(np.int32).min + maxv = np.iinfo(np.int32).max + arr = rng.integers(low=minv, high=maxv, size=N).astype('int32') + arr[np.random.choice(arr.shape[0], 10)] = minv + arr[np.random.choice(arr.shape[0], 10)] = maxv + 
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
+
+
+def test_sort_uint():
+    # Random data with NPY_MAX_UINT32 sprinkled
+    rng = np.random.default_rng(42)
+    N = 2047
+    maxv = np.iinfo(np.uint32).max
+    arr = rng.integers(low=0, high=maxv, size=N).astype('uint32')
+    arr[np.random.choice(arr.shape[0], 10)] = maxv
+    assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
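As a closing note on the AVX-512 partitioning above: the per-vector step of
partition_vec_* can be modelled in plain scalar C, which may help when reading
the masked compress-store calls. The sketch below was written for this summary
and is not code from the patch; the names are invented, and the vector kernel
compares with "not less than" (i.e. >=), which only changes where pivot-equal
elements land.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Scalar model of one partition_vec_* step: route each value of an
 * n-element block either to a left cursor (<= pivot) or to a right
 * cursor (> pivot) and report how many went right. The AVX-512 version
 * does the same with one compare mask and two compress-stores.
 */
static int
partition_block_scalar(int32_t *arr, size_t left, size_t right,
                       const int32_t *block, size_t n, int32_t pivot)
{
    size_t num_gt = 0;
    for (size_t i = 0; i < n; i++) {
        num_gt += (block[i] > pivot);
    }
    /* left cursor grows upward, right cursor starts at right - num_gt */
    size_t lo = left, hi = right - num_gt;
    for (size_t i = 0; i < n; i++) {
        if (block[i] > pivot) {
            arr[hi++] = block[i];
        }
        else {
            arr[lo++] = block[i];
        }
    }
    return (int)num_gt;
}

int main(void)
{
    int32_t arr[8] = {0};
    const int32_t block[8] = {9, 2, 7, 1, 8, 3, 6, 4};
    int num_gt = partition_block_scalar(arr, 0, 8, block, 8, 5);
    /* prints: 2 1 3 4 9 7 8 6 (4 greater than the pivot) */
    for (int i = 0; i < 8; i++) {
        printf("%d ", arr[i]);
    }
    printf("(%d greater than the pivot)\n", num_gt);
    return 0;
}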