#include #include #include #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include "lowlevel_strided_loops.h" #include "conversions.h" #include "str_to_int.h" #include "array_coercion.h" /* * Coercion to boolean is done via integer right now. */ NPY_NO_EXPORT int npy_to_bool(PyArray_Descr *NPY_UNUSED(descr), const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) { int64_t res; if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) { return -1; } *dataptr = (char)(res != 0); return 0; } /* * In order to not pack a whole copy of a floating point parser, we copy the * result into ascii and call the Python one. Float parsing isn't super quick * so this is not terrible, but avoiding it would speed up things. * * Also note that parsing the first float of a complex will copy the whole * string to ascii rather than just the first part. * TODO: A tweak of the break might be a simple mitigation there. * * @param str The UCS4 string to parse * @param end Pointer to the end of the string * @param skip_trailing_whitespace If false does not skip trailing whitespace * (used by the complex parser). * @param result Output stored as double value. */ static inline int double_from_ucs4( const Py_UCS4 *str, const Py_UCS4 *end, bool strip_whitespace, double *result, const Py_UCS4 **p_end) { /* skip leading whitespace */ if (strip_whitespace) { while (Py_UNICODE_ISSPACE(*str)) { str++; } } if (str == end) { return -1; /* empty or only whitespace: not a floating point number */ } /* We convert to ASCII for the Python parser, use stack if small: */ char stack_buf[128]; char *heap_buf = NULL; char *ascii = stack_buf; size_t str_len = end - str + 1; if (str_len > 128) { heap_buf = PyMem_MALLOC(str_len); if (heap_buf == NULL) { PyErr_NoMemory(); return -1; } ascii = heap_buf; } char *c = ascii; for (; str < end; str++, c++) { if (NPY_UNLIKELY(*str >= 128)) { /* Character cannot be used, ignore for end calculation and stop */ end = str; break; } *c = (char)(*str); } *c = '\0'; char *end_parsed; *result = PyOS_string_to_double(ascii, &end_parsed, NULL); /* Rewind `end` to the first UCS4 character not parsed: */ end = end - (c - end_parsed); PyMem_FREE(heap_buf); if (*result == -1. && PyErr_Occurred()) { return -1; } if (strip_whitespace) { /* and then skip any remaining whitespace: */ while (Py_UNICODE_ISSPACE(*end)) { end++; } } *p_end = end; return 0; } NPY_NO_EXPORT int npy_to_float(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) { double double_val; const Py_UCS4 *p_end; if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) { return -1; } if (p_end != end) { return -1; } float val = (float)double_val; memcpy(dataptr, &val, sizeof(float)); if (!PyArray_ISNBO(descr->byteorder)) { npy_bswap4_unaligned(dataptr); } return 0; } NPY_NO_EXPORT int npy_to_double(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) { double val; const Py_UCS4 *p_end; if (double_from_ucs4(str, end, true, &val, &p_end) < 0) { return -1; } if (p_end != end) { return -1; } memcpy(dataptr, &val, sizeof(double)); if (!PyArray_ISNBO(descr->byteorder)) { npy_bswap8_unaligned(dataptr); } return 0; } static bool to_complex_int( const Py_UCS4 *item, const Py_UCS4 *token_end, double *p_real, double *p_imag, Py_UCS4 imaginary_unit, bool allow_parens) { const Py_UCS4 *p_end; bool unmatched_opening_paren = false; /* Remove whitespace before the possibly leading '(' */ while (Py_UNICODE_ISSPACE(*item)) { ++item; } if (allow_parens && (*item == '(')) { unmatched_opening_paren = true; ++item; /* Allow whitespace within the parentheses: "( 1j)" */ while (Py_UNICODE_ISSPACE(*item)) { ++item; } } if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) { return false; } if (p_end == token_end) { // No imaginary part in the string (e.g. "3.5") *p_imag = 0.0; return !unmatched_opening_paren; } if (*p_end == imaginary_unit) { /* Only an imaginary part (e.g "1.5j") */ *p_imag = *p_real; *p_real = 0.0; ++p_end; } else if (*p_end == '+' || *p_end == '-') { /* Imaginary part still to parse */ if (*p_end == '+') { ++p_end; /* Advance to support +- (and ++) */ } if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) { return false; } if (*p_end != imaginary_unit) { return false; } ++p_end; } else { *p_imag = 0; } if (unmatched_opening_paren) { /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */ while (Py_UNICODE_ISSPACE(*p_end)) { ++p_end; } if (*p_end == ')') { ++p_end; } else { /* parentheses was not closed */ return false; } } while (Py_UNICODE_ISSPACE(*p_end)) { ++p_end; } return p_end == token_end; } NPY_NO_EXPORT int npy_to_cfloat(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig) { double real; double imag; bool success = to_complex_int( str, end, &real, &imag, pconfig->imaginary_unit, true); if (!success) { return -1; } npy_complex64 val = {(float)real, (float)imag}; memcpy(dataptr, &val, sizeof(npy_complex64)); if (!PyArray_ISNBO(descr->byteorder)) { npy_bswap4_unaligned(dataptr); npy_bswap4_unaligned(dataptr + 4); } return 0; } NPY_NO_EXPORT int npy_to_cdouble(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig) { double real; double imag; bool success = to_complex_int( str, end, &real, &imag, pconfig->imaginary_unit, true); if (!success) { return -1; } npy_complex128 val = {real, imag}; memcpy(dataptr, &val, sizeof(npy_complex128)); if (!PyArray_ISNBO(descr->byteorder)) { npy_bswap8_unaligned(dataptr); npy_bswap8_unaligned(dataptr + 8); } return 0; } /* * String and unicode conversion functions. */ NPY_NO_EXPORT int npy_to_string(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(unused)) { const Py_UCS4* c = str; size_t length = descr->elsize; for (size_t i = 0; i < length; i++) { if (c < end) { /* * loadtxt assumed latin1, which is compatible with UCS1 (first * 256 unicode characters). */ if (NPY_UNLIKELY(*c > 255)) { /* TODO: Was UnicodeDecodeError, is unspecific error good? */ return -1; } dataptr[i] = (Py_UCS1)(*c); c++; } else { dataptr[i] = '\0'; } } return 0; } NPY_NO_EXPORT int npy_to_unicode(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(unused)) { int length = descr->elsize / 4; if (length <= end - str) { memcpy(dataptr, str, length * 4); } else { size_t given_len = end - str; memcpy(dataptr, str, given_len * 4); memset(dataptr + given_len * 4, '\0', (length - given_len) * 4); } if (!PyArray_ISNBO(descr->byteorder)) { for (int i = 0; i < length; i++) { npy_bswap4_unaligned(dataptr); dataptr += 4; } } return 0; } /* * Convert functions helper for the generic converter. */ static PyObject * call_converter_function( PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters) { PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length); if (s == NULL) { return s; } if (byte_converters) { Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL)); if (s == NULL) { return NULL; } } if (func == NULL) { return s; } PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL); Py_DECREF(s); return result; } NPY_NO_EXPORT int npy_to_generic_with_converter(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *config, PyObject *func) { bool use_byte_converter; if (func == NULL) { use_byte_converter = config->c_byte_converters; } else { use_byte_converter = config->python_byte_converters; } /* Converts to unicode and calls custom converter (if set) */ PyObject *converted = call_converter_function( func, str, (size_t)(end - str), use_byte_converter); if (converted == NULL) { return -1; } int res = PyArray_Pack(descr, dataptr, converted); Py_DECREF(converted); return res; } NPY_NO_EXPORT int npy_to_generic(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *config) { return npy_to_generic_with_converter(descr, str, end, dataptr, config, NULL); }