diff options
| author | Eric Wieser <wieser.eric@gmail.com> | 2020-08-24 13:14:17 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-24 13:14:17 +0100 |
| commit | 018fb37987e02fdba377b008e5ca3d68e4611cb2 (patch) | |
| tree | 5288e8bc6b1c01ddaae982e4659fb51b43cded38 /numpy/core | |
| parent | 117b3f4a293c689777d3cd445969ca99acb01cfe (diff) | |
| parent | fa45695b787070275696e979eb9f476ef3afc5e6 (diff) | |
| download | numpy-018fb37987e02fdba377b008e5ca3d68e4611cb2.tar.gz | |
Merge branch 'master' into cleanup-Long
Diffstat (limited to 'numpy/core')
39 files changed, 2265 insertions, 2286 deletions
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index 5d9642ea8..ad1530419 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -1628,6 +1628,3 @@ def set_string_function(f, repr=True): return multiarray.set_string_function(_default_array_str, 0) else: return multiarray.set_string_function(f, repr) - -set_string_function(_default_array_str, False) -set_string_function(_default_array_repr, True) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index aede12080..a4a84397d 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -790,6 +790,8 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'descriptor.h'), join('src', 'multiarray', 'dtypemeta.h'), join('src', 'multiarray', 'dragon4.h'), + join('src', 'multiarray', 'einsum_debug.h'), + join('src', 'multiarray', 'einsum_sumprod.h'), join('src', 'multiarray', 'getset.h'), join('src', 'multiarray', 'hashdescr.h'), join('src', 'multiarray', 'iterators.h'), @@ -853,6 +855,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), join('src', 'multiarray', 'flagsobject.c'), join('src', 'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c index d626d1260..e365b49e4 100644 --- a/numpy/core/src/common/array_assign.c +++ b/numpy/core/src/common/array_assign.c @@ -67,12 +67,12 @@ broadcast_strides(int ndim, npy_intp const *shape, broadcast_error: { PyObject *errmsg; - errmsg = PyUString_FromFormat("could not broadcast %s from shape ", + errmsg = PyUnicode_FromFormat("could not broadcast %s from shape ", strides_name); PyUString_ConcatAndDel(&errmsg, build_shape_string(strides_ndim, strides_shape)); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" into shape ")); + PyUnicode_FromString(" into shape ")); PyUString_ConcatAndDel(&errmsg, build_shape_string(ndim, shape)); PyErr_SetObject(PyExc_ValueError, errmsg); diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h index 97308238a..072993ec2 100644 --- a/numpy/core/src/common/npy_cblas.h +++ b/numpy/core/src/common/npy_cblas.h @@ -47,8 +47,10 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef HAVE_BLAS_ILP64 #define CBLAS_INT npy_int64 +#define CBLAS_INT_MAX NPY_MAX_INT64 #else #define CBLAS_INT int +#define CBLAS_INT_MAX INT_MAX #endif #define BLASNAME(name) CBLAS_FUNC(name) @@ -59,6 +61,39 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #undef BLASINT #undef BLASNAME + +/* + * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done + * (BLAS won't handle negative or zero strides the way we want). + */ +static NPY_INLINE CBLAS_INT +blas_stride(npy_intp stride, unsigned itemsize) +{ + /* + * Should probably check pointer alignment also, but this may cause + * problems if we require complex to be 16 byte aligned. + */ + if (stride > 0 && (stride % itemsize) == 0) { + stride /= itemsize; + if (stride <= CBLAS_INT_MAX) { + return stride; + } + } + return 0; +} + +/* + * Define a chunksize for CBLAS. + * + * The chunksize is the greatest power of two less than CBLAS_INT_MAX. + */ +#if NPY_MAX_INTP > CBLAS_INT_MAX +# define NPY_CBLAS_CHUNK (CBLAS_INT_MAX / 2 + 1) +#else +# define NPY_CBLAS_CHUNK NPY_MAX_INTP +#endif + + #ifdef __cplusplus } #endif diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src index da631c830..9c1fa0bad 100644 --- a/numpy/core/src/multiarray/_multiarray_tests.c.src +++ b/numpy/core/src/multiarray/_multiarray_tests.c.src @@ -1902,7 +1902,7 @@ PrintFloat_Printf_g(PyObject *obj, int precision) PyOS_snprintf(str, sizeof(str), "%.*g", precision, val); } - return PyUString_FromString(str); + return PyUnicode_FromString(str); } diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 95c650674..5da1b5f29 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -416,7 +416,7 @@ WARN_IN_DEALLOC(PyObject* warning, const char * msg) { if (PyErr_WarnEx(warning, msg, 1) < 0) { PyObject * s; - s = PyUString_FromString("array_dealloc"); + s = PyUnicode_FromString("array_dealloc"); if (s) { PyErr_WriteUnraisable(s); Py_DECREF(s); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 9508fb5ad..3fee587b9 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -865,7 +865,7 @@ VOID_setitem(PyObject *op, void *input, void *vap) npy_intp names_size = PyTuple_GET_SIZE(descr->names); if (names_size != PyTuple_Size(op)) { - errmsg = PyUString_FromFormat( + errmsg = PyUnicode_FromFormat( "could not assign tuple of length %zd to structure " "with %" NPY_INTP_FMT " fields.", PyTuple_Size(op), names_size); diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 25bb2d195..af40cdc2c 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -931,7 +931,7 @@ _descriptor_from_pep3118_format(char const *s) } *p = '\0'; - str = PyUString_FromStringAndSize(buf, strlen(buf)); + str = PyUnicode_FromStringAndSize(buf, strlen(buf)); if (str == NULL) { free(buf); return NULL; diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index 3d3ac7709..5f8250fb7 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -133,7 +133,7 @@ NPY_NO_EXPORT PyArray_Descr * _array_typedescr_fromstr(char const *c_str) { PyArray_Descr *descr = NULL; - PyObject *stringobj = PyString_FromString(c_str); + PyObject *stringobj = PyBytes_FromString(c_str); if (stringobj == NULL) { return NULL; @@ -264,10 +264,10 @@ convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending) for (i = 0; i < n && vals[i] < 0; i++); if (i == n) { - return PyUString_FromFormat("()%s", ending); + return PyUnicode_FromFormat("()%s", ending); } else { - ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]); + ret = PyUnicode_FromFormat("(%" NPY_INTP_FMT, vals[i++]); if (ret == NULL) { return NULL; } @@ -275,10 +275,10 @@ convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending) for (; i < n; ++i) { if (vals[i] < 0) { - tmp = PyUString_FromString(",newaxis"); + tmp = PyUnicode_FromString(",newaxis"); } else { - tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]); + tmp = PyUnicode_FromFormat(",%" NPY_INTP_FMT, vals[i]); } if (tmp == NULL) { Py_DECREF(ret); @@ -292,10 +292,10 @@ convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending) } if (i == 1) { - tmp = PyUString_FromFormat(",)%s", ending); + tmp = PyUnicode_FromFormat(",)%s", ending); } else { - tmp = PyUString_FromFormat(")%s", ending); + tmp = PyUnicode_FromFormat(")%s", ending); } PyUString_ConcatAndDel(&ret, tmp); return ret; @@ -310,7 +310,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j) *shape1 = NULL, *shape2 = NULL, *shape1_i = NULL, *shape2_j = NULL; - format = PyUString_FromString("shapes %s and %s not aligned:" + format = PyUnicode_FromString("shapes %s and %s not aligned:" " %d (dim %d) != %d (dim %d)"); shape1 = convert_shape_to_string(PyArray_NDIM(a), PyArray_DIMS(a), ""); @@ -333,7 +333,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j) goto end; } - errmsg = PyUString_Format(format, fmt_args); + errmsg = PyUnicode_Format(format, fmt_args); if (errmsg != NULL) { PyErr_SetObject(PyExc_ValueError, errmsg); } diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index 793cefaf8..4410825fa 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -292,43 +292,6 @@ npy_memchr(char * haystack, char needle, return p; } -/* - * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done - * (BLAS won't handle negative or zero strides the way we want). - */ -static NPY_INLINE int -blas_stride(npy_intp stride, unsigned itemsize) -{ - /* - * Should probably check pointer alignment also, but this may cause - * problems if we require complex to be 16 byte aligned. - */ - if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) { - stride /= itemsize; -#ifndef HAVE_BLAS_ILP64 - if (stride <= INT_MAX) { -#else - if (stride <= NPY_MAX_INT64) { -#endif - return stride; - } - } - return 0; -} - -/* - * Define a chunksize for CBLAS. CBLAS counts in integers. - */ -#if NPY_MAX_INTP > INT_MAX -# ifndef HAVE_BLAS_ILP64 -# define NPY_CBLAS_CHUNK (INT_MAX / 2 + 1) -# else -# define NPY_CBLAS_CHUNK (NPY_MAX_INT64 / 2 + 1) -# endif -#else -# define NPY_CBLAS_CHUNK NPY_MAX_INTP -#endif - #include "ucsnarrow.h" /* diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 41a10afdc..b68b9322d 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -248,13 +248,13 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) return -1; } PyTuple_SET_ITEM(tupobj,0,obj); - obj = PyUString_FromString((const char *)format); + obj = PyUnicode_FromString((const char *)format); if (obj == NULL) { Py_DECREF(tupobj); Py_DECREF(it); return -1; } - strobj = PyUString_Format(obj, tupobj); + strobj = PyUnicode_Format(obj, tupobj); Py_DECREF(obj); Py_DECREF(tupobj); if (strobj == NULL) { diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index bd038c53a..1f5845eb7 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -95,8 +95,11 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num) key = PyLong_FromLong(type_num); cobj = PyDict_GetItem(obj, key); Py_DECREF(key); - if (cobj && NpyCapsule_Check(cobj)) { - castfunc = NpyCapsule_AsVoidPtr(cobj); + if (cobj && PyCapsule_CheckExact(cobj)) { + castfunc = PyCapsule_GetPointer(cobj, NULL); + if (castfunc == NULL) { + return NULL; + } } } } diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 3ff397817..6add032bf 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -868,11 +868,14 @@ PyArray_NewFromDescr_int( func = PyObject_GetAttr((PyObject *)fa, npy_ma_str_array_finalize); if (func && func != Py_None) { - if (NpyCapsule_Check(func)) { + if (PyCapsule_CheckExact(func)) { /* A C-function is stored here */ PyArray_FinalizeFunc *cfunc; - cfunc = NpyCapsule_AsVoidPtr(func); + cfunc = PyCapsule_GetPointer(func, NULL); Py_DECREF(func); + if (cfunc == NULL) { + goto fail; + } if (cfunc((PyArrayObject *)fa, obj) < 0) { goto fail; } @@ -1747,7 +1750,7 @@ PyArray_FromStructInterface(PyObject *input) return Py_NotImplemented; } } - if (!NpyCapsule_Check(attr)) { + if (!PyCapsule_CheckExact(attr)) { if (PyType_Check(input) && PyObject_HasAttrString(attr, "__get__")) { /* * If the input is a class `attr` should be a property-like object. @@ -1759,7 +1762,10 @@ PyArray_FromStructInterface(PyObject *input) } goto fail; } - inter = NpyCapsule_AsVoidPtr(attr); + inter = PyCapsule_GetPointer(attr, NULL); + if (inter == NULL) { + goto fail; + } if (inter->two != 2) { goto fail; } diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c index 3649bbe4c..348473309 100644 --- a/numpy/core/src/multiarray/datetime.c +++ b/numpy/core/src/multiarray/datetime.c @@ -1435,14 +1435,14 @@ raise_if_datetime64_metadata_cast_error(char *object_type, } else { PyObject *errmsg; - errmsg = PyUString_FromFormat("Cannot cast %s " + errmsg = PyUnicode_FromFormat("Cannot cast %s " "from metadata ", object_type); errmsg = append_metastr_to_string(src_meta, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" to ")); + PyUnicode_FromString(" to ")); errmsg = append_metastr_to_string(dst_meta, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromFormat(" according to the rule %s", + PyUnicode_FromFormat(" according to the rule %s", npy_casting_to_string(casting))); PyErr_SetObject(PyExc_TypeError, errmsg); Py_DECREF(errmsg); @@ -1467,14 +1467,14 @@ raise_if_timedelta64_metadata_cast_error(char *object_type, } else { PyObject *errmsg; - errmsg = PyUString_FromFormat("Cannot cast %s " + errmsg = PyUnicode_FromFormat("Cannot cast %s " "from metadata ", object_type); errmsg = append_metastr_to_string(src_meta, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" to ")); + PyUnicode_FromString(" to ")); errmsg = append_metastr_to_string(dst_meta, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromFormat(" according to the rule %s", + PyUnicode_FromFormat(" according to the rule %s", npy_casting_to_string(casting))); PyErr_SetObject(PyExc_TypeError, errmsg); Py_DECREF(errmsg); @@ -1601,15 +1601,15 @@ compute_datetime_metadata_greatest_common_divisor( incompatible_units: { PyObject *errmsg; - errmsg = PyUString_FromString("Cannot get " + errmsg = PyUnicode_FromString("Cannot get " "a common metadata divisor for " "NumPy datetime metadata "); errmsg = append_metastr_to_string(meta1, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" and ")); + PyUnicode_FromString(" and ")); errmsg = append_metastr_to_string(meta2, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" because they have " + PyUnicode_FromString(" because they have " "incompatible nonlinear base time units")); PyErr_SetObject(PyExc_TypeError, errmsg); Py_DECREF(errmsg); @@ -1617,12 +1617,12 @@ incompatible_units: { } units_overflow: { PyObject *errmsg; - errmsg = PyUString_FromString("Integer overflow " + errmsg = PyUnicode_FromString("Integer overflow " "getting a common metadata divisor for " "NumPy datetime metadata "); errmsg = append_metastr_to_string(meta1, 0, errmsg); PyUString_ConcatAndDel(&errmsg, - PyUString_FromString(" and ")); + PyUnicode_FromString(" and ")); errmsg = append_metastr_to_string(meta2, 0, errmsg); PyErr_SetObject(PyExc_OverflowError, errmsg); Py_DECREF(errmsg); @@ -1747,7 +1747,7 @@ convert_datetime_metadata_to_tuple(PyArray_DatetimeMetaData *meta) } PyTuple_SET_ITEM(dt_tuple, 0, - PyUString_FromString(_datetime_strings[meta->base])); + PyUnicode_FromString(_datetime_strings[meta->base])); PyTuple_SET_ITEM(dt_tuple, 1, PyLong_FromLong(meta->num)); @@ -1771,7 +1771,7 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple, if (!PyTuple_Check(tuple)) { PyObject *errmsg; - errmsg = PyUString_FromString("Require tuple for tuple to NumPy " + errmsg = PyUnicode_FromString("Require tuple for tuple to NumPy " "datetime metadata conversion, not "); PyUString_ConcatAndDel(&errmsg, PyObject_Repr(tuple)); PyErr_SetObject(PyExc_TypeError, errmsg); @@ -1973,7 +1973,7 @@ append_metastr_to_string(PyArray_DatetimeMetaData *meta, if (meta->base == NPY_FR_GENERIC) { /* Without brackets, give a string "generic" */ if (skip_brackets) { - PyUString_ConcatAndDel(&ret, PyUString_FromString("generic")); + PyUString_ConcatAndDel(&ret, PyUnicode_FromString("generic")); return ret; } /* But with brackets, append nothing */ @@ -1994,18 +1994,18 @@ append_metastr_to_string(PyArray_DatetimeMetaData *meta, if (num == 1) { if (skip_brackets) { - res = PyUString_FromFormat("%s", basestr); + res = PyUnicode_FromFormat("%s", basestr); } else { - res = PyUString_FromFormat("[%s]", basestr); + res = PyUnicode_FromFormat("[%s]", basestr); } } else { if (skip_brackets) { - res = PyUString_FromFormat("%d%s", num, basestr); + res = PyUnicode_FromFormat("%d%s", num, basestr); } else { - res = PyUString_FromFormat("[%d%s]", num, basestr); + res = PyUnicode_FromFormat("[%d%s]", num, basestr); } } diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index f47f0ce06..95597b812 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -472,7 +472,7 @@ _convert_from_array_descr(PyObject *obj, int align) if (PyUnicode_GetLength(name) == 0) { Py_DECREF(name); if (title == NULL) { - name = PyUString_FromFormat("f%d", i); + name = PyUnicode_FromFormat("f%d", i); if (name == NULL) { goto fail; } @@ -673,7 +673,7 @@ _convert_from_list(PyObject *obj, int align) } PyTuple_SET_ITEM(tup, 0, (PyObject *)conv); PyTuple_SET_ITEM(tup, 1, size_obj); - PyObject *key = PyUString_FromFormat("f%d", i); + PyObject *key = PyUnicode_FromFormat("f%d", i); if (!key) { Py_DECREF(tup); goto fail; @@ -1887,10 +1887,10 @@ arraydescr_protocol_typestr_get(PyArray_Descr *self) size >>= 2; } if (self->type_num == NPY_OBJECT) { - ret = PyUString_FromFormat("%c%c", endian, basic_); + ret = PyUnicode_FromFormat("%c%c", endian, basic_); } else { - ret = PyUString_FromFormat("%c%c%d", endian, basic_, size); + ret = PyUnicode_FromFormat("%c%c%d", endian, basic_, size); } if (PyDataType_ISDATETIME(self)) { PyArray_DatetimeMetaData *meta; @@ -1974,7 +1974,7 @@ arraydescr_protocol_descr_get(PyArray_Descr *self) if (dobj == NULL) { return NULL; } - PyTuple_SET_ITEM(dobj, 0, PyUString_FromString("")); + PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString("")); PyTuple_SET_ITEM(dobj, 1, arraydescr_protocol_typestr_get(self)); res = PyList_New(1); if (res == NULL) { @@ -2450,7 +2450,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args)) if (self->type_num == NPY_UNICODE) { elsize >>= 2; } - obj = PyUString_FromFormat("%c%d",self->kind, elsize); + obj = PyUnicode_FromFormat("%c%d",self->kind, elsize); } PyTuple_SET_ITEM(ret, 1, Py_BuildValue("(NOO)", obj, Py_False, Py_True)); @@ -2492,7 +2492,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args)) PyTuple_SET_ITEM(state, 0, PyLong_FromLong(3)); } - PyTuple_SET_ITEM(state, 1, PyUString_FromFormat("%c", endian)); + PyTuple_SET_ITEM(state, 1, PyUnicode_FromFormat("%c", endian)); PyTuple_SET_ITEM(state, 2, arraydescr_subdescr_get(self)); if (PyDataType_HASFIELDS(self)) { Py_INCREF(self->names); @@ -2894,7 +2894,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args) PyArray_DatetimeMetaData temp_dt_data; if ((! PyTuple_Check(metadata)) || (PyTuple_Size(metadata) != 2)) { - errmsg = PyUString_FromString("Invalid datetime dtype (metadata, c_metadata): "); + errmsg = PyUnicode_FromString("Invalid datetime dtype (metadata, c_metadata): "); PyUString_ConcatAndDel(&errmsg, PyObject_Repr(metadata)); PyErr_SetObject(PyExc_ValueError, errmsg); Py_DECREF(errmsg); @@ -3393,7 +3393,7 @@ arraydescr_field_subset_view(PyArray_Descr *self, PyObject *ind) /* disallow duplicate field indices */ if (PyDict_Contains(fields, name)) { PyObject *msg = NULL; - PyObject *fmt = PyUString_FromString( + PyObject *fmt = PyUnicode_FromString( "duplicate field of name {!r}"); if (fmt != NULL) { msg = PyObject_CallMethod(fmt, "format", "O", name); diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c index 553d0effb..a7b252a77 100644 --- a/numpy/core/src/multiarray/dragon4.c +++ b/numpy/core/src/multiarray/dragon4.c @@ -3093,7 +3093,7 @@ Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\ free_dragon4_bigint_scratch(scratch);\ return NULL;\ }\ - ret = PyUString_FromString(scratch->repr);\ + ret = PyUnicode_FromString(scratch->repr);\ free_dragon4_bigint_scratch(scratch);\ return ret;\ }\ @@ -3130,7 +3130,7 @@ Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\ free_dragon4_bigint_scratch(scratch);\ return NULL;\ }\ - ret = PyUString_FromString(scratch->repr);\ + ret = PyUnicode_FromString(scratch->repr);\ free_dragon4_bigint_scratch(scratch);\ return ret;\ }\ diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 2538e05c6..6ad375f67 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -16,7 +16,6 @@ #define _MULTIARRAYMODULE #include <numpy/npy_common.h> #include <numpy/arrayobject.h> -#include <numpy/halffloat.h> #include <npy_pycompat.h> #include <ctype.h> @@ -25,1898 +24,8 @@ #include "common.h" #include "ctors.h" -#ifdef NPY_HAVE_SSE_INTRINSICS -#define EINSUM_USE_SSE1 1 -#else -#define EINSUM_USE_SSE1 0 -#endif - -#ifdef NPY_HAVE_SSE2_INTRINSICS -#define EINSUM_USE_SSE2 1 -#else -#define EINSUM_USE_SSE2 0 -#endif - -#if EINSUM_USE_SSE1 -#include <xmmintrin.h> -#endif - -#if EINSUM_USE_SSE2 -#include <emmintrin.h> -#endif - -#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) - -/********** PRINTF DEBUG TRACING **************/ -#define NPY_EINSUM_DBG_TRACING 0 - -#if NPY_EINSUM_DBG_TRACING -#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); -#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); -#else -#define NPY_EINSUM_DBG_PRINT(s) -#define NPY_EINSUM_DBG_PRINT1(s, p1) -#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) -#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) -#endif -/**********************************************/ - -/**begin repeat - * #name = byte, short, int, long, longlong, - * ubyte, ushort, uint, ulong, ulonglong, - * half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_float, npy_float, npy_double, npy_longdouble, - * npy_float, npy_double, npy_longdouble# - * #to = ,,,,, - * ,,,,, - * npy_float_to_half,,,, - * ,,# - * #from = ,,,,, - * ,,,,, - * npy_half_to_float,,,, - * ,,# - * #complex = 0*5, - * 0*5, - * 0*4, - * 1*3# - * #float32 = 0*5, - * 0*5, - * 0,1,0,0, - * 0*3# - * #float64 = 0*5, - * 0*5, - * 0,0,1,0, - * 0*3# - */ - -/**begin repeat1 - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ -static void -@name@_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data_out += stride_out; -# elif @nop@ == 2 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data_out += stride_out; -# elif @nop@ == 3 - *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2) + - @from@(*(@type@ *)data_out)); - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + - ((@temptype@ *)data_out)[0]; - ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + - ((@temptype@ *)data_out)[1]; - data0 += stride0; - data_out += stride_out; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } -} - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_one(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data_out = (@type@ *)dataptr[1]; - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - case 0: - return; - } - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -#if !@complex@ - data_out[@i@] = @to@(@from@(data0[@i@]) + - @from@(data_out[@i@])); -#else /* complex */ - ((@temptype@ *)data_out + 2*@i@)[0] = - ((@temptype@ *)data0 + 2*@i@)[0] + - ((@temptype@ *)data_out + 2*@i@)[0]; - ((@temptype@ *)data_out + 2*@i@)[1] = - ((@temptype@ *)data0 + 2*@i@)[1] + - ((@temptype@ *)data_out + 2*@i@)[1]; -#endif -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 2 && !@complex@ - -static void -@name@_sum_of_products_contig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && - EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -/* Some extra specializations for the two operand case */ -static void -@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value0_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value0_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value0_sse = _mm_set_ps1(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#elif EINSUM_USE_SSE2 && @float64@ - value0_sse = _mm_set1_pd(value0); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } - else { - return; - } - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(value0 * - @from@(data1[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data1 += 8; - data_out += 8; - } - - /* Finish off the loop */ - if (count > 0) { - goto finish_after_unrolled_loop; - } -} - -static void -@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @type@ *data_out = (@type@ *)dataptr[2]; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, b, value1_sse; -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, b, value1_sse; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ - case 0: - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - value1_sse = _mm_set_ps1(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); - _mm_store_ps(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - value1_sse = _mm_set1_pd(value1); - - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); - _mm_store_pd(data_out+@i@, b); -/**end repeat2**/ - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); - b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); - _mm_storeu_ps(data_out+@i@, b); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); - b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); - _mm_storeu_pd(data_out+@i@, b); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@])* - value1 + - @from@(data_out[@i@])); -/**end repeat2**/ -#endif - data0 += 8; - data_out += 8; - } - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -static void -@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#elif @nop@ == 3 && !@complex@ - -static void -@name@_sum_of_products_contig_three(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - @type@ *data0 = (@type@ *)dataptr[0]; - @type@ *data1 = (@type@ *)dataptr[1]; - @type@ *data2 = (@type@ *)dataptr[2]; - @type@ *data_out = (@type@ *)dataptr[3]; - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ - data0 += 8; - data1 += 8; - data2 += 8; - data_out += 8; - } - - /* Finish off the loop */ - -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - if (count-- == 0) { - return; - } - data_out[@i@] = @to@(@from@(data0[@i@]) * - @from@(data1[@i@]) * - @from@(data2[@i@]) + - @from@(data_out[@i@])); -/**end repeat2**/ -} - -#else /* @nop@ > 3 || @complex */ - -static void -@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *NPY_UNUSED(strides), npy_intp count) -{ - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - *(@type@ *)dataptr[nop] = @to@(temp + - @from@(*(@type@ *)dataptr[i])); - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(@type@); - } -#else /* complex */ -# if @nop@ <= 3 -# define _SUMPROD_NOP @nop@ -# else -# define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + - ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; - - for (i = 0; i <= _SUMPROD_NOP; ++i) { - dataptr[i] += sizeof(@type@); - } -# undef _SUMPROD_NOP -#endif - } -} - -#endif /* functions for various @nop@ */ - -#if @nop@ == 1 - -static void -@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -#if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; -#else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; -} - -#endif /* @nop@ == 1 */ - -static void -@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; -#else - @temptype@ accum = 0; -#endif - -#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) && !@complex@ - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) && !@complex@ - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", - (int)count); - - while (count--) { -#if !@complex@ -# if @nop@ == 1 - accum += @from@(*(@type@ *)data0); - data0 += stride0; -# elif @nop@ == 2 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1); - data0 += stride0; - data1 += stride1; -# elif @nop@ == 3 - accum += @from@(*(@type@ *)data0) * - @from@(*(@type@ *)data1) * - @from@(*(@type@ *)data2); - data0 += stride0; - data1 += stride1; - data2 += stride2; -# else - @temptype@ temp = @from@(*(@type@ *)dataptr[0]); - int i; - for (i = 1; i < nop; ++i) { - temp *= @from@(*(@type@ *)dataptr[i]); - } - accum += temp; - for (i = 0; i < nop; ++i) { - dataptr[i] += strides[i]; - } -# endif -#else /* complex */ -# if @nop@ == 1 - accum_re += ((@temptype@ *)data0)[0]; - accum_im += ((@temptype@ *)data0)[1]; - data0 += stride0; -# else -# if @nop@ <= 3 -#define _SUMPROD_NOP @nop@ -# else -#define _SUMPROD_NOP nop -# endif - @temptype@ re, im, tmp; - int i; - re = ((@temptype@ *)dataptr[0])[0]; - im = ((@temptype@ *)dataptr[0])[1]; - for (i = 1; i < _SUMPROD_NOP; ++i) { - tmp = re * ((@temptype@ *)dataptr[i])[0] - - im * ((@temptype@ *)dataptr[i])[1]; - im = re * ((@temptype@ *)dataptr[i])[1] + - im * ((@temptype@ *)dataptr[i])[0]; - re = tmp; - } - accum_re += re; - accum_im += im; - for (i = 0; i < _SUMPROD_NOP; ++i) { - dataptr[i] += strides[i]; - } -#undef _SUMPROD_NOP -# endif -#endif - } - -#if @complex@ -# if @nop@ <= 3 - ((@temptype@ *)dataptr[@nop@])[0] += accum_re; - ((@temptype@ *)dataptr[@nop@])[1] += accum_im; -# else - ((@temptype@ *)dataptr[nop])[0] += accum_re; - ((@temptype@ *)dataptr[nop])[1] += accum_im; -# endif -#else -# if @nop@ <= 3 - *((@type@ *)dataptr[@nop@]) = @to@(accum + - @from@(*((@type@ *)dataptr[@nop@]))); -# else - *((@type@ *)dataptr[nop]) = @to@(accum + - @from@(*((@type@ *)dataptr[nop]))); -# endif -#endif - -} - -/**end repeat1**/ - -/**end repeat**/ - - -/* Do OR of ANDs for the boolean type */ - -/**begin repeat - * #nop = 1, 2, 3, 1000# - * #noplabel = one, two, three, any# - */ - -static void -bool_sum_of_products_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; - npy_intp stride_out = strides[@nop@]; -#endif - - while (count--) { -#if @nop@ == 1 - *(npy_bool *)data_out = *(npy_bool *)data0 || - *(npy_bool *)data_out; - data0 += stride0; - data_out += stride_out; -#elif @nop@ == 2 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data_out += stride_out; -#elif @nop@ == 3 - *(npy_bool *)data_out = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || - *(npy_bool *)data_out; - data0 += stride0; - data1 += stride1; - data2 += stride2; - data_out += stride_out; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } -} - -static void -bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ -#if (@nop@ <= 3) - char *data0 = dataptr[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; -#endif -#if (@nop@ <= 3) - char *data_out = dataptr[@nop@]; -#endif - -#if (@nop@ <= 3) -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat1 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: -# if @nop@ == 1 - ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 2 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# elif @nop@ == 3 - ((npy_bool *)data_out)[@i@] = - (((npy_bool *)data0)[@i@] && - ((npy_bool *)data1)[@i@] && - ((npy_bool *)data2)[@i@]) || - ((npy_bool *)data_out)[@i@]; -# endif -/**end repeat1**/ - case 0: - return; - } -#endif - -/* Unroll the loop by 8 for fixed-size nop */ -#if (@nop@ <= 3) - while (count >= 8) { - count -= 8; -#else - while (count--) { -#endif - -# if @nop@ == 1 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 2 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# elif @nop@ == 3 -/**begin repeat1 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - *((npy_bool *)data_out + @i@) = - ((*((npy_bool *)data0 + @i@)) && - (*((npy_bool *)data1 + @i@)) && - (*((npy_bool *)data2 + @i@))) || - (*((npy_bool *)data_out + @i@)); -/**end repeat1**/ - data0 += 8*sizeof(npy_bool); - data1 += 8*sizeof(npy_bool); - data2 += 8*sizeof(npy_bool); - data_out += 8*sizeof(npy_bool); -# else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; - for (i = 0; i <= nop; ++i) { - dataptr[i] += sizeof(npy_bool); - } -# endif - } - - /* If the loop was unrolled, we need to finish it off */ -#if (@nop@ <= 3) - goto finish_after_unrolled_loop; -#endif -} - -static void -bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, - npy_intp const *strides, npy_intp count) -{ - npy_bool accum = 0; - -#if (@nop@ <= 3) - char *data0 = dataptr[0]; - npy_intp stride0 = strides[0]; -#endif -#if (@nop@ == 2 || @nop@ == 3) - char *data1 = dataptr[1]; - npy_intp stride1 = strides[1]; -#endif -#if (@nop@ == 3) - char *data2 = dataptr[2]; - npy_intp stride2 = strides[2]; -#endif - - while (count--) { -#if @nop@ == 1 - accum = *(npy_bool *)data0 || accum; - data0 += stride0; -#elif @nop@ == 2 - accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; - data0 += stride0; - data1 += stride1; -#elif @nop@ == 3 - accum = (*(npy_bool *)data0 && - *(npy_bool *)data1 && - *(npy_bool *)data2) || accum; - data0 += stride0; - data1 += stride1; - data2 += stride2; -#else - npy_bool temp = *(npy_bool *)dataptr[0]; - int i; - for (i = 1; i < nop; ++i) { - temp = temp && *(npy_bool *)dataptr[i]; - } - accum = temp || accum; - for (i = 0; i <= nop; ++i) { - dataptr[i] += strides[i]; - } -#endif - } - -# if @nop@ <= 3 - *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); -# else - *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); -# endif -} - -/**end repeat**/ - -typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); - -/* These tables need to match up with the type enum */ -static sum_of_products_fn -_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ - &@name@_sum_of_products_contig_outstride0_one, -#else - NULL, -#endif -/**end repeat**/ -}; /* End of _contig_outstride0_unary_specialization_table */ - -static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 0, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 0, 0, 0, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_stride0_contig_outstride0_two, - &@name@_sum_of_products_stride0_contig_outcontig_two, - &@name@_sum_of_products_contig_stride0_outstride0_two, - &@name@_sum_of_products_contig_stride0_outcontig_two, - &@name@_sum_of_products_contig_contig_outstride0_two, -}, -#else - {NULL, NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _binary_specialization_table */ - -static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_outstride0_any, - &@name@_sum_of_products_outstride0_one, - &@name@_sum_of_products_outstride0_two, - &@name@_sum_of_products_outstride0_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _outstride0_specialized_table */ - -static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_contig_any, - &@name@_sum_of_products_contig_one, - &@name@_sum_of_products_contig_two, - &@name@_sum_of_products_contig_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _allcontig_specialized_table */ - -static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { -/**begin repeat - * #name = bool, - * byte, ubyte, - * short, ushort, - * int, uint, - * long, ulong, - * longlong, ulonglong, - * float, double, longdouble, - * cfloat, cdouble, clongdouble, - * object, string, unicode, void, - * datetime, timedelta, half# - * #use = 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, - * 1, 1, 1, - * 1, 1, 1, - * 0, 0, 0, 0, - * 0, 0, 1# - */ -#if @use@ -{ - &@name@_sum_of_products_any, - &@name@_sum_of_products_one, - &@name@_sum_of_products_two, - &@name@_sum_of_products_three -}, -#else - {NULL, NULL, NULL, NULL}, -#endif -/**end repeat**/ -}; /* End of _unnspecialized_table */ - -static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num, - npy_intp itemsize, npy_intp const *fixed_strides) -{ - int iop; - - if (type_num >= NPY_NTYPES) { - return NULL; - } - - /* contiguous reduction */ - if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { - sum_of_products_fn ret = - _contig_outstride0_unary_specialization_table[type_num]; - if (ret != NULL) { - return ret; - } - } - - /* nop of 2 has more specializations */ - if (nop == 2) { - /* Encode the zero/contiguous strides */ - int code; - code = (fixed_strides[0] == 0) ? 0 : - (fixed_strides[0] == itemsize) ? 2*2*1 : 8; - code += (fixed_strides[1] == 0) ? 0 : - (fixed_strides[1] == itemsize) ? 2*1 : 8; - code += (fixed_strides[2] == 0) ? 0 : - (fixed_strides[2] == itemsize) ? 1 : 8; - if (code >= 2 && code < 7) { - sum_of_products_fn ret = - _binary_specialization_table[type_num][code-2]; - if (ret != NULL) { - return ret; - } - } - } - - /* Inner loop with an output stride of 0 */ - if (fixed_strides[nop] == 0) { - return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* Check for all contiguous */ - for (iop = 0; iop < nop + 1; ++iop) { - if (fixed_strides[iop] != itemsize) { - break; - } - } - - /* Contiguous loop */ - if (iop == nop + 1) { - return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; - } - - /* None of the above specializations caught it, general loops */ - return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; -} +#include "einsum_sumprod.h" +#include "einsum_debug.h" /* diff --git a/numpy/core/src/multiarray/einsum_debug.h b/numpy/core/src/multiarray/einsum_debug.h new file mode 100644 index 000000000..9aa81fcbd --- /dev/null +++ b/numpy/core/src/multiarray/einsum_debug.h @@ -0,0 +1,28 @@ +/* + * This file provides debug macros used by the other einsum files. + * + * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) + * The University of British Columbia + * + * See LICENSE.txt for the license. + */ +#ifndef _NPY_MULTIARRAY_EINSUM_DEBUG_H +#define _NPY_MULTIARRAY_EINSUM_DEBUG_H + +/********** PRINTF DEBUG TRACING **************/ +#define NPY_EINSUM_DBG_TRACING 0 + +#if NPY_EINSUM_DBG_TRACING +#include <cstdio> +#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s); +#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1); +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2); +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s); +#else +#define NPY_EINSUM_DBG_PRINT(s) +#define NPY_EINSUM_DBG_PRINT1(s, p1) +#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) +#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) +#endif + +#endif diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src new file mode 100644 index 000000000..c58e74287 --- /dev/null +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -0,0 +1,1897 @@ +/* + * This file provides optimized sum of product implementations used internally + * by einsum. + * + * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com) + * The University of British Columbia + * + * See LICENSE.txt for the license. + */ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#include <numpy/npy_common.h> +#include <numpy/ndarraytypes.h> /* for NPY_NTYPES */ +#include <numpy/halffloat.h> + +#include "einsum_sumprod.h" +#include "einsum_debug.h" + + +#ifdef NPY_HAVE_SSE_INTRINSICS +#define EINSUM_USE_SSE1 1 +#else +#define EINSUM_USE_SSE1 0 +#endif + +#ifdef NPY_HAVE_SSE2_INTRINSICS +#define EINSUM_USE_SSE2 1 +#else +#define EINSUM_USE_SSE2 0 +#endif + +#if EINSUM_USE_SSE1 +#include <xmmintrin.h> +#endif + +#if EINSUM_USE_SSE2 +#include <emmintrin.h> +#endif + +#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) + +/**********************************************/ + +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_cfloat, npy_cdouble, npy_clongdouble# + * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_float, npy_float, npy_double, npy_longdouble, + * npy_float, npy_double, npy_longdouble# + * #to = ,,,,, + * ,,,,, + * npy_float_to_half,,,, + * ,,# + * #from = ,,,,, + * ,,,,, + * npy_half_to_float,,,, + * ,,# + * #complex = 0*5, + * 0*5, + * 0*4, + * 1*3# + * #float32 = 0*5, + * 0*5, + * 0,1,0,0, + * 0*3# + * #float64 = 0*5, + * 0*5, + * 0,0,1,0, + * 0*3# + */ + +/**begin repeat1 + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ +static void +@name@_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count); + + while (count--) { +#if !@complex@ +# if @nop@ == 1 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data_out += stride_out; +# elif @nop@ == 2 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data_out += stride_out; +# elif @nop@ == 3 + *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) * + @from@(*(@type@ *)data2) + + @from@(*(@type@ *)data_out)); + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +# else + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +# endif +#else /* complex */ +# if @nop@ == 1 + ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] + + ((@temptype@ *)data_out)[0]; + ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] + + ((@temptype@ *)data_out)[1]; + data0 += stride0; + data_out += stride_out; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } +} + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_one(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data_out = (@type@ *)dataptr[1]; + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + case 0: + return; + } + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else /* complex */ + ((@temptype@ *)data_out + 2*@i@)[0] = + ((@temptype@ *)data0 + 2*@i@)[0] + + ((@temptype@ *)data_out + 2*@i@)[0]; + ((@temptype@ *)data_out + 2*@i@)[1] = + ((@temptype@ *)data0 + 2*@i@)[1] + + ((@temptype@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +#elif @nop@ == 2 && !@complex@ + +static void +@name@_sum_of_products_contig_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data_out = (@type@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + case 0: + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && + EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && + EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data0 += 8; + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +/* Some extra specializations for the two operand case */ +static void +@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data_out = (@type@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b, value0_sse; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b, value0_sse; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + data_out[@i@] = @to@(value0 * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + case 0: + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + value0_sse = _mm_set_ps1(value0); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + if (count > 0) { + goto finish_after_unrolled_loop; + } + else { + return; + } + } +#elif EINSUM_USE_SSE2 && @float64@ + value0_sse = _mm_set1_pd(value0); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + if (count > 0) { + goto finish_after_unrolled_loop; + } + else { + return; + } + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(value0 * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + if (count > 0) { + goto finish_after_unrolled_loop; + } +} + +static void +@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); + @type@ *data_out = (@type@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b, value1_sse; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b, value1_sse; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + data_out[@i@] = @to@(@from@(data0[@i@])* + value1 + + @from@(data_out[@i@])); +/**end repeat2**/ + case 0: + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + value1_sse = _mm_set_ps1(value1); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + value1_sse = _mm_set1_pd(value1); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4# + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(@from@(data0[@i@])* + value1 + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +static void +@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @temptype@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + accum += @from@(data0[@i@]) * @from@(data1[@i@]); +/**end repeat2**/ + case 0: + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + + _mm_prefetch(data0 + 512, _MM_HINT_T0); + _mm_prefetch(data1 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); + accum_sse = _mm_add_ps(accum_sse, a); +/**end repeat2**/ + data0 += 8; + data1 += 8; + } + + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + + _mm_prefetch(data0 + 512, _MM_HINT_T0); + _mm_prefetch(data1 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); + accum_sse = _mm_add_pd(accum_sse, a); +/**end repeat2**/ + data0 += 8; + data1 += 8; + } + + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ + _mm_prefetch(data0 + 512, _MM_HINT_T0); + _mm_prefetch(data1 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); + accum_sse = _mm_add_ps(accum_sse, a); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ + _mm_prefetch(data0 + 512, _MM_HINT_T0); + _mm_prefetch(data1 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); + accum_sse = _mm_add_pd(accum_sse, a); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + accum += @from@(data0[@i@]) * @from@(data1[@i@]); +/**end repeat2**/ +#endif + data0 += 8; + data1 += 8; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); +#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +static void +@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); + @type@ *data1 = (@type@ *)dataptr[1]; + @temptype@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + accum += @from@(data1[@i@]); +/**end repeat2**/ + case 0: + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); +/**end repeat2**/ + data1 += 8; + } + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); +/**end repeat2**/ + data1 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + accum += @from@(data1[@i@]); +/**end repeat2**/ +#endif + data1 += 8; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); +#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +static void +@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); + @temptype@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: + accum += @from@(data0[@i@]); +/**end repeat2**/ + case 0: + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + accum += @from@(data0[@i@]); +/**end repeat2**/ +#endif + data0 += 8; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); +#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +#elif @nop@ == 3 && !@complex@ + +static void +@name@_sum_of_products_contig_three(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + @type@ *data0 = (@type@ *)dataptr[0]; + @type@ *data1 = (@type@ *)dataptr[1]; + @type@ *data2 = (@type@ *)dataptr[2]; + @type@ *data_out = (@type@ *)dataptr[3]; + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data2 += 8; + data_out += 8; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +#else /* @nop@ > 3 || @complex */ + +static void +@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *NPY_UNUSED(strides), npy_intp count) +{ + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n", + (int)count); + + while (count--) { +#if !@complex@ + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + *(@type@ *)dataptr[nop] = @to@(temp + + @from@(*(@type@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(@type@); + } +#else /* complex */ +# if @nop@ <= 3 +# define _SUMPROD_NOP @nop@ +# else +# define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[0]; + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((@temptype@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += sizeof(@type@); + } +# undef _SUMPROD_NOP +#endif + } +} + +#endif /* functions for various @nop@ */ + +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if @complex@ + @temptype@ accum_re = 0, accum_im = 0; + @temptype@ *data0 = (@temptype@ *)dataptr[0]; +#else + @temptype@ accum = 0; + @type@ *data0 = (@type@ *)dataptr[0]; +#endif + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); +#endif + + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", + (int)count); + +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat2 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +#if !@complex@ + accum += @from@(data0[@i@]); +#else /* complex */ + accum_re += data0[2*@i@+0]; + accum_im += data0[2*@i@+1]; +#endif +/**end repeat2**/ + case 0: +#if @complex@ + ((@temptype@ *)dataptr[1])[0] += accum_re; + ((@temptype@ *)dataptr[1])[1] += accum_im; +#else + *((@type@ *)dataptr[1]) = @to@(accum + + @from@(*((@type@ *)dataptr[1]))); +#endif + return; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + + _mm_prefetch(data0 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + + _mm_prefetch(data0 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +#if EINSUM_USE_SSE1 && @float32@ + _mm_prefetch(data0 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 4# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); +/**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ + _mm_prefetch(data0 + 512, _MM_HINT_T0); + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ +# if !@complex@ + accum += @from@(data0[@i@]); +# else /* complex */ + accum_re += data0[2*@i@+0]; + accum_im += data0[2*@i@+1]; +# endif +/**end repeat2**/ +#endif + +#if !@complex@ + data0 += 8; +#else + data0 += 8*2; +#endif + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); +#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; +} + +#endif /* @nop@ == 1 */ + +static void +@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if @complex@ + @temptype@ accum_re = 0, accum_im = 0; +#else + @temptype@ accum = 0; +#endif + +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n", + (int)count); + + while (count--) { +#if !@complex@ +# if @nop@ == 1 + accum += @from@(*(@type@ *)data0); + data0 += stride0; +# elif @nop@ == 2 + accum += @from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1); + data0 += stride0; + data1 += stride1; +# elif @nop@ == 3 + accum += @from@(*(@type@ *)data0) * + @from@(*(@type@ *)data1) * + @from@(*(@type@ *)data2); + data0 += stride0; + data1 += stride1; + data2 += stride2; +# else + @temptype@ temp = @from@(*(@type@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(@type@ *)dataptr[i]); + } + accum += temp; + for (i = 0; i < nop; ++i) { + dataptr[i] += strides[i]; + } +# endif +#else /* complex */ +# if @nop@ == 1 + accum_re += ((@temptype@ *)data0)[0]; + accum_im += ((@temptype@ *)data0)[1]; + data0 += stride0; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + @temptype@ re, im, tmp; + int i; + re = ((@temptype@ *)dataptr[0])[0]; + im = ((@temptype@ *)dataptr[0])[1]; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((@temptype@ *)dataptr[i])[0] - + im * ((@temptype@ *)dataptr[i])[1]; + im = re * ((@temptype@ *)dataptr[i])[1] + + im * ((@temptype@ *)dataptr[i])[0]; + re = tmp; + } + accum_re += re; + accum_im += im; + for (i = 0; i < _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } + +#if @complex@ +# if @nop@ <= 3 + ((@temptype@ *)dataptr[@nop@])[0] += accum_re; + ((@temptype@ *)dataptr[@nop@])[1] += accum_im; +# else + ((@temptype@ *)dataptr[nop])[0] += accum_re; + ((@temptype@ *)dataptr[nop])[1] += accum_im; +# endif +#else +# if @nop@ <= 3 + *((@type@ *)dataptr[@nop@]) = @to@(accum + + @from@(*((@type@ *)dataptr[@nop@]))); +# else + *((@type@ *)dataptr[nop]) = @to@(accum + + @from@(*((@type@ *)dataptr[nop]))); +# endif +#endif + +} + +/**end repeat1**/ + +/**end repeat**/ + + +/* Do OR of ANDs for the boolean type */ + +/**begin repeat + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ + +static void +bool_sum_of_products_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + + while (count--) { +#if @nop@ == 1 + *(npy_bool *)data_out = *(npy_bool *)data0 || + *(npy_bool *)data_out; + data0 += stride0; + data_out += stride_out; +#elif @nop@ == 2 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data_out += stride_out; +#elif @nop@ == 3 + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } +} + +static void +bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; +#endif + +#if (@nop@ <= 3) +/* This is placed before the main loop to make small counts faster */ +finish_after_unrolled_loop: + switch (count) { +/**begin repeat1 + * #i = 6, 5, 4, 3, 2, 1, 0# + */ + case @i@+1: +# if @nop@ == 1 + ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 2 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool *)data1)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# elif @nop@ == 3 + ((npy_bool *)data_out)[@i@] = + (((npy_bool *)data0)[@i@] && + ((npy_bool *)data1)[@i@] && + ((npy_bool *)data2)[@i@]) || + ((npy_bool *)data_out)[@i@]; +# endif +/**end repeat1**/ + case 0: + return; + } +#endif + +/* Unroll the loop by 8 for fixed-size nop */ +#if (@nop@ <= 3) + while (count >= 8) { + count -= 8; +#else + while (count--) { +#endif + +# if @nop@ == 1 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 2 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# elif @nop@ == 3 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@)) && + (*((npy_bool *)data2 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 8*sizeof(npy_bool); + data1 += 8*sizeof(npy_bool); + data2 += 8*sizeof(npy_bool); + data_out += 8*sizeof(npy_bool); +# else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(npy_bool); + } +# endif + } + + /* If the loop was unrolled, we need to finish it off */ +#if (@nop@ <= 3) + goto finish_after_unrolled_loop; +#endif +} + +static void +bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ + npy_bool accum = 0; + +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + while (count--) { +#if @nop@ == 1 + accum = *(npy_bool *)data0 || accum; + data0 += stride0; +#elif @nop@ == 2 + accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; + data0 += stride0; + data1 += stride1; +#elif @nop@ == 3 + accum = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || accum; + data0 += stride0; + data1 += stride1; + data2 += stride2; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + accum = temp || accum; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } + +# if @nop@ <= 3 + *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); +# else + *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); +# endif +} + +/**end repeat**/ + +/* These tables need to match up with the type enum */ +static sum_of_products_fn +_contig_outstride0_unary_specialization_table[NPY_NTYPES] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 0, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ + &@name@_sum_of_products_contig_outstride0_one, +#else + NULL, +#endif +/**end repeat**/ +}; /* End of _contig_outstride0_unary_specialization_table */ + +static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 0, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 0, 0, 0, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ +{ + &@name@_sum_of_products_stride0_contig_outstride0_two, + &@name@_sum_of_products_stride0_contig_outcontig_two, + &@name@_sum_of_products_contig_stride0_outstride0_two, + &@name@_sum_of_products_contig_stride0_outcontig_two, + &@name@_sum_of_products_contig_contig_outstride0_two, +}, +#else + {NULL, NULL, NULL, NULL, NULL}, +#endif +/**end repeat**/ +}; /* End of _binary_specialization_table */ + +static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ +{ + &@name@_sum_of_products_outstride0_any, + &@name@_sum_of_products_outstride0_one, + &@name@_sum_of_products_outstride0_two, + &@name@_sum_of_products_outstride0_three +}, +#else + {NULL, NULL, NULL, NULL}, +#endif +/**end repeat**/ +}; /* End of _outstride0_specialized_table */ + +static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ +{ + &@name@_sum_of_products_contig_any, + &@name@_sum_of_products_contig_one, + &@name@_sum_of_products_contig_two, + &@name@_sum_of_products_contig_three +}, +#else + {NULL, NULL, NULL, NULL}, +#endif +/**end repeat**/ +}; /* End of _allcontig_specialized_table */ + +static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { +/**begin repeat + * #name = bool, + * byte, ubyte, + * short, ushort, + * int, uint, + * long, ulong, + * longlong, ulonglong, + * float, double, longdouble, + * cfloat, cdouble, clongdouble, + * object, string, unicode, void, + * datetime, timedelta, half# + * #use = 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, + * 1, 1, 1, + * 1, 1, 1, + * 0, 0, 0, 0, + * 0, 0, 1# + */ +#if @use@ +{ + &@name@_sum_of_products_any, + &@name@_sum_of_products_one, + &@name@_sum_of_products_two, + &@name@_sum_of_products_three +}, +#else + {NULL, NULL, NULL, NULL}, +#endif +/**end repeat**/ +}; /* End of _unnspecialized_table */ + +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides) +{ + int iop; + + if (type_num >= NPY_NTYPES) { + return NULL; + } + + /* contiguous reduction */ + if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) { + sum_of_products_fn ret = + _contig_outstride0_unary_specialization_table[type_num]; + if (ret != NULL) { + return ret; + } + } + + /* nop of 2 has more specializations */ + if (nop == 2) { + /* Encode the zero/contiguous strides */ + int code; + code = (fixed_strides[0] == 0) ? 0 : + (fixed_strides[0] == itemsize) ? 2*2*1 : 8; + code += (fixed_strides[1] == 0) ? 0 : + (fixed_strides[1] == itemsize) ? 2*1 : 8; + code += (fixed_strides[2] == 0) ? 0 : + (fixed_strides[2] == itemsize) ? 1 : 8; + if (code >= 2 && code < 7) { + sum_of_products_fn ret = + _binary_specialization_table[type_num][code-2]; + if (ret != NULL) { + return ret; + } + } + } + + /* Inner loop with an output stride of 0 */ + if (fixed_strides[nop] == 0) { + return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* Check for all contiguous */ + for (iop = 0; iop < nop + 1; ++iop) { + if (fixed_strides[iop] != itemsize) { + break; + } + } + + /* Contiguous loop */ + if (iop == nop + 1) { + return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0]; + } + + /* None of the above specializations caught it, general loops */ + return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; +} diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h new file mode 100644 index 000000000..c6cf18ec6 --- /dev/null +++ b/numpy/core/src/multiarray/einsum_sumprod.h @@ -0,0 +1,12 @@ +#ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#define _NPY_MULTIARRAY_EINSUM_SUMPROD_H + +#include <numpy/npy_common.h> + +typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp); + +NPY_VISIBILITY_HIDDEN sum_of_products_fn +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp const *fixed_strides); + +#endif diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c index bec0523d5..9b7d8deae 100644 --- a/numpy/core/src/multiarray/flagsobject.c +++ b/numpy/core/src/multiarray/flagsobject.c @@ -711,7 +711,7 @@ arrayflags_print(PyArrayFlagsObject *self) if (fl & NPY_ARRAY_WARN_ON_WRITE) { _warn_on_write = " (with WARN_ON_WRITE=True)"; } - return PyUString_FromFormat( + return PyUnicode_FromFormat( " %s : %s\n %s : %s\n" " %s : %s\n %s : %s%s\n" " %s : %s\n %s : %s\n" diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c index c8533539b..3575d6fad 100644 --- a/numpy/core/src/multiarray/getset.c +++ b/numpy/core/src/multiarray/getset.c @@ -217,7 +217,7 @@ array_protocol_descr_get(PyArrayObject *self) if (dobj == NULL) { return NULL; } - PyTuple_SET_ITEM(dobj, 0, PyString_FromString("")); + PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString("")); PyTuple_SET_ITEM(dobj, 1, array_typestr_get(self)); res = PyList_New(1); if (res == NULL) { @@ -621,7 +621,6 @@ static PyObject * array_struct_get(PyArrayObject *self) { PyArrayInterface *inter; - PyObject *ret; inter = (PyArrayInterface *)PyArray_malloc(sizeof(PyArrayInterface)); if (inter==NULL) { @@ -673,8 +672,14 @@ array_struct_get(PyArrayObject *self) else { inter->descr = NULL; } + PyObject *ret = PyCapsule_New(inter, NULL, gentype_struct_free); + if (ret == NULL) { + return NULL; + } Py_INCREF(self); - ret = NpyCapsule_FromVoidPtrAndDesc(inter, self, gentype_struct_free); + if (PyCapsule_SetContext(ret, self) < 0) { + return NULL; + } return ret; } diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index db15ff1d5..c0cea0f21 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -1418,7 +1418,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) return 0; } else if (tup == NULL){ - PyObject *errmsg = PyUString_FromString("no field of name "); + PyObject *errmsg = PyUnicode_FromString("no field of name "); PyUString_Concat(&errmsg, ind); PyErr_SetObject(PyExc_ValueError, errmsg); Py_DECREF(errmsg); @@ -2438,7 +2438,7 @@ mapiter_fill_info(PyArrayMapIterObject *mit, npy_index_info *indices, * Attempt to set a meaningful exception. Could also find out * if a boolean index was converted. */ - errmsg = PyUString_FromString("shape mismatch: indexing arrays could not " + errmsg = PyUnicode_FromString("shape mismatch: indexing arrays could not " "be broadcast together with shapes "); if (errmsg == NULL) { return -1; @@ -3183,7 +3183,7 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, goto finish; broadcast_error: - errmsg = PyUString_FromString("shape mismatch: value array " + errmsg = PyUnicode_FromString("shape mismatch: value array " "of shape "); if (errmsg == NULL) { goto finish; @@ -3204,7 +3204,7 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, goto finish; } - tmp = PyUString_FromString("could not be broadcast to indexing " + tmp = PyUnicode_FromString("could not be broadcast to indexing " "result of shape "); PyUString_ConcatAndDel(&errmsg, tmp); if (errmsg == NULL) { diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 0519434e8..ae2dceb10 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -2585,9 +2585,10 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args)) PyArrayObject *arr; PyArray_Descr *dtype; PyObject *c; + if (PyArray_SIZE(self) != 1) { - PyErr_SetString(PyExc_TypeError, "only length-1 arrays can "\ - "be converted to Python scalars"); + PyErr_SetString(PyExc_TypeError, + "only length-1 arrays can be converted to Python scalars"); return NULL; } @@ -2598,38 +2599,18 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args)) if (!PyArray_CanCastArrayTo(self, dtype, NPY_SAME_KIND_CASTING) && !(PyArray_TYPE(self) == NPY_OBJECT)) { - PyObject *err, *msg_part; + PyObject *descr = (PyObject*)PyArray_DESCR(self); + Py_DECREF(dtype); - err = PyString_FromString("unable to convert "); - if (err == NULL) { - return NULL; - } - msg_part = PyObject_Repr((PyObject*)PyArray_DESCR(self)); - if (msg_part == NULL) { - Py_DECREF(err); - return NULL; - } - PyString_ConcatAndDel(&err, msg_part); - if (err == NULL) { - return NULL; - } - msg_part = PyString_FromString(", to complex."); - if (msg_part == NULL) { - Py_DECREF(err); - return NULL; - } - PyString_ConcatAndDel(&err, msg_part); - if (err == NULL) { - return NULL; - } - PyErr_SetObject(PyExc_TypeError, err); - Py_DECREF(err); + PyErr_Format(PyExc_TypeError, + "Unable to convert %R to complex", descr); return NULL; } if (PyArray_TYPE(self) == NPY_OBJECT) { /* let python try calling __complex__ on the object. */ PyObject *args, *res; + Py_DECREF(dtype); args = Py_BuildValue("(O)", *((PyObject**)PyArray_DATA(self))); if (args == NULL) { diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 923469edf..db419636d 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -4335,18 +4335,18 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL; static int intern_strings(void) { - npy_ma_str_array = PyUString_InternFromString("__array__"); - npy_ma_str_array_prepare = PyUString_InternFromString("__array_prepare__"); - npy_ma_str_array_wrap = PyUString_InternFromString("__array_wrap__"); - npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__"); - npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__"); - npy_ma_str_implementation = PyUString_InternFromString("_implementation"); - npy_ma_str_order = PyUString_InternFromString("order"); - npy_ma_str_copy = PyUString_InternFromString("copy"); - npy_ma_str_dtype = PyUString_InternFromString("dtype"); - npy_ma_str_ndmin = PyUString_InternFromString("ndmin"); - npy_ma_str_axis1 = PyUString_InternFromString("axis1"); - npy_ma_str_axis2 = PyUString_InternFromString("axis2"); + npy_ma_str_array = PyUnicode_InternFromString("__array__"); + npy_ma_str_array_prepare = PyUnicode_InternFromString("__array_prepare__"); + npy_ma_str_array_wrap = PyUnicode_InternFromString("__array_wrap__"); + npy_ma_str_array_finalize = PyUnicode_InternFromString("__array_finalize__"); + npy_ma_str_ufunc = PyUnicode_InternFromString("__array_ufunc__"); + npy_ma_str_implementation = PyUnicode_InternFromString("_implementation"); + npy_ma_str_order = PyUnicode_InternFromString("order"); + npy_ma_str_copy = PyUnicode_InternFromString("copy"); + npy_ma_str_dtype = PyUnicode_InternFromString("dtype"); + npy_ma_str_ndmin = PyUnicode_InternFromString("ndmin"); + npy_ma_str_axis1 = PyUnicode_InternFromString("axis1"); + npy_ma_str_axis2 = PyUnicode_InternFromString("axis2"); return npy_ma_str_array && npy_ma_str_array_prepare && npy_ma_str_array_wrap && npy_ma_str_array_finalize && @@ -4477,14 +4477,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { goto err; } - c_api = NpyCapsule_FromVoidPtr((void *)PyArray_API, NULL); + c_api = PyCapsule_New((void *)PyArray_API, NULL, NULL); if (c_api == NULL) { goto err; } PyDict_SetItemString(d, "_ARRAY_API", c_api); Py_DECREF(c_api); - c_api = NpyCapsule_FromVoidPtr((void *)PyUFunc_API, NULL); + c_api = PyCapsule_New((void *)PyUFunc_API, NULL, NULL); if (c_api == NULL) { goto err; } @@ -4506,7 +4506,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyDict_SetItemString(d, "tracemalloc_domain", s); Py_DECREF(s); - s = PyUString_FromString("3.1"); + s = PyUnicode_FromString("3.1"); PyDict_SetItemString(d, "__version__", s); Py_DECREF(s); @@ -4540,7 +4540,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { } Py_DECREF(s); - s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL); + s = PyCapsule_New((void *)_datetime_strings, NULL, NULL); if (s == NULL) { goto err; } diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index a0dda4090..4bc6d2ca1 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -1755,7 +1755,7 @@ broadcast_error: { char *tmpstr; if (op_axes == NULL) { - errmsg = PyUString_FromString("operands could not be broadcast " + errmsg = PyUnicode_FromString("operands could not be broadcast " "together with shapes "); if (errmsg == NULL) { return 0; @@ -1776,7 +1776,7 @@ broadcast_error: { } } if (itershape != NULL) { - tmp = PyUString_FromString("and requested shape "); + tmp = PyUnicode_FromString("and requested shape "); if (tmp == NULL) { Py_DECREF(errmsg); return 0; @@ -1801,7 +1801,7 @@ broadcast_error: { Py_DECREF(errmsg); } else { - errmsg = PyUString_FromString("operands could not be broadcast " + errmsg = PyUnicode_FromString("operands could not be broadcast " "together with remapped shapes " "[original->remapped]: "); for (iop = 0; iop < nop; ++iop) { @@ -1843,7 +1843,7 @@ broadcast_error: { } } if (itershape != NULL) { - tmp = PyUString_FromString("and requested shape "); + tmp = PyUnicode_FromString("and requested shape "); if (tmp == NULL) { Py_DECREF(errmsg); return 0; @@ -1877,11 +1877,11 @@ operand_different_than_broadcast: { /* Start of error message */ if (op_flags[iop] & NPY_ITER_READONLY) { - errmsg = PyUString_FromString("non-broadcastable operand " + errmsg = PyUnicode_FromString("non-broadcastable operand " "with shape "); } else { - errmsg = PyUString_FromString("non-broadcastable output " + errmsg = PyUnicode_FromString("non-broadcastable output " "operand with shape "); } if (errmsg == NULL) { @@ -1913,7 +1913,7 @@ operand_different_than_broadcast: { } } - tmp = PyUString_FromString(" [remapped to "); + tmp = PyUnicode_FromString(" [remapped to "); if (tmp == NULL) { return 0; } @@ -1932,7 +1932,7 @@ operand_different_than_broadcast: { } } - tmp = PyUString_FromString(" doesn't match the broadcast shape "); + tmp = PyUnicode_FromString(" doesn't match the broadcast shape "); if (tmp == NULL) { return 0; } diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index e271906c1..5b4836cc9 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -1142,7 +1142,7 @@ npyiter_dealloc(NewNpyArrayIterObject *self) "results.", 1) < 0) { PyObject *s; - s = PyUString_FromString("npyiter_dealloc"); + s = PyUnicode_FromString("npyiter_dealloc"); if (s) { PyErr_WriteUnraisable(s); Py_DECREF(s); diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index 58b9e2c30..c1bff1e42 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -447,7 +447,7 @@ _void_to_hex(const char* argbuf, const Py_ssize_t arglen, } memcpy(&retbuf[j], echars, strlen(echars)); - retval = PyUString_FromStringAndSize(retbuf, slen); + retval = PyUnicode_FromStringAndSize(retbuf, slen); PyMem_Free(retbuf); return retval; @@ -518,21 +518,21 @@ datetimetype_repr(PyObject *self) */ if ((scal->obmeta.num == 1 && scal->obmeta.base != NPY_FR_h) || scal->obmeta.base == NPY_FR_GENERIC) { - ret = PyUString_FromString("numpy.datetime64('"); + ret = PyUnicode_FromString("numpy.datetime64('"); PyUString_ConcatAndDel(&ret, - PyUString_FromString(iso)); + PyUnicode_FromString(iso)); PyUString_ConcatAndDel(&ret, - PyUString_FromString("')")); + PyUnicode_FromString("')")); } else { - ret = PyUString_FromString("numpy.datetime64('"); + ret = PyUnicode_FromString("numpy.datetime64('"); PyUString_ConcatAndDel(&ret, - PyUString_FromString(iso)); + PyUnicode_FromString(iso)); PyUString_ConcatAndDel(&ret, - PyUString_FromString("','")); + PyUnicode_FromString("','")); ret = append_metastr_to_string(&scal->obmeta, 1, ret); PyUString_ConcatAndDel(&ret, - PyUString_FromString("')")); + PyUnicode_FromString("')")); } return ret; @@ -554,31 +554,31 @@ timedeltatype_repr(PyObject *self) /* The value */ if (scal->obval == NPY_DATETIME_NAT) { - ret = PyUString_FromString("numpy.timedelta64('NaT'"); + ret = PyUnicode_FromString("numpy.timedelta64('NaT'"); } else { /* * Can't use "%lld" if HAVE_LONG_LONG is not defined */ #if defined(HAVE_LONG_LONG) - ret = PyUString_FromFormat("numpy.timedelta64(%lld", + ret = PyUnicode_FromFormat("numpy.timedelta64(%lld", (long long)scal->obval); #else - ret = PyUString_FromFormat("numpy.timedelta64(%ld", + ret = PyUnicode_FromFormat("numpy.timedelta64(%ld", (long)scal->obval); #endif } /* The metadata unit */ if (scal->obmeta.base == NPY_FR_GENERIC) { PyUString_ConcatAndDel(&ret, - PyUString_FromString(")")); + PyUnicode_FromString(")")); } else { PyUString_ConcatAndDel(&ret, - PyUString_FromString(",'")); + PyUnicode_FromString(",'")); ret = append_metastr_to_string(&scal->obmeta, 1, ret); PyUString_ConcatAndDel(&ret, - PyUString_FromString("')")); + PyUnicode_FromString("')")); } return ret; @@ -611,7 +611,7 @@ datetimetype_str(PyObject *self) return NULL; } - return PyUString_FromString(iso); + return PyUnicode_FromString(iso); } static char *_datetime_verbose_strings[NPY_DATETIME_NUMUNITS] = { @@ -657,21 +657,21 @@ timedeltatype_str(PyObject *self) } if (scal->obval == NPY_DATETIME_NAT) { - ret = PyUString_FromString("NaT"); + ret = PyUnicode_FromString("NaT"); } else { /* * Can't use "%lld" if HAVE_LONG_LONG is not defined */ #if defined(HAVE_LONG_LONG) - ret = PyUString_FromFormat("%lld ", + ret = PyUnicode_FromFormat("%lld ", (long long)(scal->obval * scal->obmeta.num)); #else - ret = PyUString_FromFormat("%ld ", + ret = PyUnicode_FromFormat("%ld ", (long)(scal->obval * scal->obmeta.num)); #endif PyUString_ConcatAndDel(&ret, - PyUString_FromString(basestr)); + PyUnicode_FromString(basestr)); } return ret; @@ -795,7 +795,7 @@ legacy_@name@_format@kind@(@type@ val) PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im); } - return PyUString_FromString(buf); + return PyUnicode_FromString(buf); } #undef _FMT1 @@ -836,7 +836,7 @@ legacy_@name@_format@kind@(npy_@name@ val){ strcpy(&buf[cnt],".0"); } - return PyUString_FromString(buf); + return PyUnicode_FromString(buf); } #undef _FMT1 @@ -904,7 +904,7 @@ c@name@type_@kind@(PyObject *self) return NULL; } - PyUString_ConcatAndDel(&istr, PyUString_FromString("j")); + PyUString_ConcatAndDel(&istr, PyUnicode_FromString("j")); return istr; } @@ -915,13 +915,13 @@ c@name@type_@kind@(PyObject *self) } } else if (npy_isnan(val.real)) { - rstr = PyUString_FromString("nan"); + rstr = PyUnicode_FromString("nan"); } else if (val.real > 0){ - rstr = PyUString_FromString("inf"); + rstr = PyUnicode_FromString("inf"); } else { - rstr = PyUString_FromString("-inf"); + rstr = PyUnicode_FromString("-inf"); } if (npy_isfinite(val.imag)) { @@ -931,19 +931,19 @@ c@name@type_@kind@(PyObject *self) } } else if (npy_isnan(val.imag)) { - istr = PyUString_FromString("+nan"); + istr = PyUnicode_FromString("+nan"); } else if (val.imag > 0){ - istr = PyUString_FromString("+inf"); + istr = PyUnicode_FromString("+inf"); } else { - istr = PyUString_FromString("-inf"); + istr = PyUnicode_FromString("-inf"); } - ret = PyUString_FromString("("); + ret = PyUnicode_FromString("("); PyUString_ConcatAndDel(&ret, rstr); PyUString_ConcatAndDel(&ret, istr); - PyUString_ConcatAndDel(&ret, PyUString_FromString("j)")); + PyUString_ConcatAndDel(&ret, PyUnicode_FromString("j)")); return ret; } @@ -1147,12 +1147,16 @@ gentype_sizeof(PyObject *self) NPY_NO_EXPORT void gentype_struct_free(PyObject *ptr) { - PyArrayInterface *arrif; - PyObject *context; - - arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL); - context = (PyObject *)PyCapsule_GetContext(ptr); - Py_DECREF(context); + PyArrayInterface *arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL); + if (arrif == NULL) { + PyErr_WriteUnraisable(ptr); + return; + } + PyObject *context = (PyObject *)PyCapsule_GetContext(ptr); + if (context == NULL && PyErr_Occurred()) { + PyErr_WriteUnraisable(ptr); + } + Py_XDECREF(context); Py_XDECREF(arrif->descr); PyArray_free(arrif->shape); PyArray_free(arrif); diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 73bb7933f..397d539c1 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -458,7 +458,7 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, const npy_intp *newdims, static void raise_reshape_size_mismatch(PyArray_Dims *newshape, PyArrayObject *arr) { - PyObject *msg = PyUString_FromFormat("cannot reshape array of size %zd " + PyObject *msg = PyUnicode_FromFormat("cannot reshape array of size %zd " "into shape ", PyArray_SIZE(arr)); PyObject *tmp = convert_shape_to_string(newshape->len, newshape->ptr, ""); @@ -997,10 +997,10 @@ build_shape_string(npy_intp n, npy_intp const *vals) } if (i == n) { - return PyUString_FromFormat("()"); + return PyUnicode_FromFormat("()"); } else { - ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]); + ret = PyUnicode_FromFormat("(%" NPY_INTP_FMT, vals[i++]); if (ret == NULL) { return NULL; } @@ -1008,10 +1008,10 @@ build_shape_string(npy_intp n, npy_intp const *vals) for (; i < n; ++i) { if (vals[i] < 0) { - tmp = PyUString_FromString(",newaxis"); + tmp = PyUnicode_FromString(",newaxis"); } else { - tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]); + tmp = PyUnicode_FromFormat(",%" NPY_INTP_FMT, vals[i]); } if (tmp == NULL) { Py_DECREF(ret); @@ -1024,7 +1024,7 @@ build_shape_string(npy_intp n, npy_intp const *vals) } } - tmp = PyUString_FromFormat(")"); + tmp = PyUnicode_FromFormat(")"); PyUString_ConcatAndDel(&ret, tmp); return ret; } diff --git a/numpy/core/src/multiarray/strfuncs.c b/numpy/core/src/multiarray/strfuncs.c index 363cbdba2..d9d9b7c0a 100644 --- a/numpy/core/src/multiarray/strfuncs.c +++ b/numpy/core/src/multiarray/strfuncs.c @@ -3,14 +3,25 @@ #include <Python.h> #include <numpy/arrayobject.h> - #include "npy_pycompat.h" - +#include "npy_import.h" #include "strfuncs.h" static PyObject *PyArray_StrFunction = NULL; static PyObject *PyArray_ReprFunction = NULL; + +static void +npy_PyErr_SetStringChained(PyObject *type, const char *message) +{ + PyObject *exc, *val, *tb; + + PyErr_Fetch(&exc, &val, &tb); + PyErr_SetString(type, message); + npy_PyErr_ChainExceptionsCause(exc, val, tb); +} + + /*NUMPY_API * Set the array print function to be a Python function. */ @@ -36,164 +47,52 @@ PyArray_SetStringFunction(PyObject *op, int repr) } -/* - * Extend string. On failure, returns NULL and leaves *strp alone. - * XXX we do this in multiple places; time for a string library? - */ -static char * -extend_str(char **strp, Py_ssize_t n, Py_ssize_t *maxp) -{ - char *str = *strp; - Py_ssize_t new_cap; - - if (n >= *maxp - 16) { - new_cap = *maxp * 2; - - if (new_cap <= *maxp) { /* overflow */ - return NULL; - } - str = PyArray_realloc(*strp, new_cap); - if (str != NULL) { - *strp = str; - *maxp = new_cap; - } - } - return str; -} - - -static int -dump_data(char **string, Py_ssize_t *n, Py_ssize_t *max_n, char *data, int nd, - npy_intp const *dimensions, npy_intp const *strides, PyArrayObject* self) -{ - PyObject *op = NULL, *sp = NULL; - char *ostring; - npy_intp i, N, ret = 0; - -#define CHECK_MEMORY do { \ - if (extend_str(string, *n, max_n) == NULL) { \ - ret = -1; \ - goto end; \ - } \ - } while (0) - - if (nd == 0) { - if ((op = PyArray_GETITEM(self, data)) == NULL) { - return -1; - } - sp = PyObject_Repr(op); - if (sp == NULL) { - ret = -1; - goto end; - } - ostring = PyString_AsString(sp); - N = PyString_Size(sp)*sizeof(char); - *n += N; - CHECK_MEMORY; - memmove(*string + (*n - N), ostring, N); - } - else { - CHECK_MEMORY; - (*string)[*n] = '['; - *n += 1; - for (i = 0; i < dimensions[0]; i++) { - if (dump_data(string, n, max_n, - data + (*strides)*i, - nd - 1, dimensions + 1, - strides + 1, self) < 0) { - return -1; - } - CHECK_MEMORY; - if (i < dimensions[0] - 1) { - (*string)[*n] = ','; - (*string)[*n+1] = ' '; - *n += 2; - } - } - CHECK_MEMORY; - (*string)[*n] = ']'; - *n += 1; - } - -#undef CHECK_MEMORY - -end: - Py_XDECREF(op); - Py_XDECREF(sp); - return ret; -} - - -static PyObject * -array_repr_builtin(PyArrayObject *self, int repr) -{ - PyObject *ret; - char *string; - /* max_n initial value is arbitrary, dump_data will extend it */ - Py_ssize_t n = 0, max_n = PyArray_NBYTES(self) * 4 + 7; - - if ((string = PyArray_malloc(max_n)) == NULL) { - return PyErr_NoMemory(); - } - - if (dump_data(&string, &n, &max_n, PyArray_DATA(self), - PyArray_NDIM(self), PyArray_DIMS(self), - PyArray_STRIDES(self), self) < 0) { - PyArray_free(string); - return NULL; - } - - if (repr) { - if (PyArray_ISEXTENDED(self)) { - ret = PyUString_FromFormat("array(%s, '%c%d')", - string, - PyArray_DESCR(self)->type, - PyArray_DESCR(self)->elsize); - } - else { - ret = PyUString_FromFormat("array(%s, '%c')", - string, - PyArray_DESCR(self)->type); - } - } - else { - ret = PyUString_FromStringAndSize(string, n); - } - - PyArray_free(string); - return ret; -} - - NPY_NO_EXPORT PyObject * array_repr(PyArrayObject *self) { - PyObject *s; + static PyObject *repr = NULL; - if (PyArray_ReprFunction == NULL) { - s = array_repr_builtin(self, 1); + if (PyArray_ReprFunction != NULL) { + return PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL); } - else { - s = PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL); + + /* + * We need to do a delayed import here as initialization on module load + * leads to circular import problems. + */ + npy_cache_import("numpy.core.arrayprint", "_default_array_repr", &repr); + if (repr == NULL) { + npy_PyErr_SetStringChained(PyExc_RuntimeError, + "Unable to configure default ndarray.__repr__"); + return NULL; } - return s; + return PyObject_CallFunctionObjArgs(repr, self, NULL); } NPY_NO_EXPORT PyObject * array_str(PyArrayObject *self) { - PyObject *s; + static PyObject *str = NULL; - if (PyArray_StrFunction == NULL) { - s = array_repr_builtin(self, 0); + if (PyArray_StrFunction != NULL) { + return PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL); } - else { - s = PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL); + + /* + * We need to do a delayed import here as initialization on module load leads + * to circular import problems. + */ + npy_cache_import("numpy.core.arrayprint", "_default_array_str", &str); + if (str == NULL) { + npy_PyErr_SetStringChained(PyExc_RuntimeError, + "Unable to configure default ndarray.__str__"); + return NULL; } - return s; + return PyObject_CallFunctionObjArgs(str, self, NULL); } + NPY_NO_EXPORT PyObject * array_format(PyArrayObject *self, PyObject *args) { @@ -221,4 +120,3 @@ array_format(PyArrayObject *self, PyObject *args) ); } } - diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c index b97f0f8b8..6b6c6bd9d 100644 --- a/numpy/core/src/multiarray/usertypes.c +++ b/numpy/core/src/multiarray/usertypes.c @@ -272,7 +272,7 @@ PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype, if (PyErr_Occurred()) { return -1; } - cobj = NpyCapsule_FromVoidPtr((void *)castfunc, NULL); + cobj = PyCapsule_New((void *)castfunc, NULL, NULL); if (cobj == NULL) { Py_DECREF(key); return -1; diff --git a/numpy/core/src/umath/_rational_tests.c.src b/numpy/core/src/umath/_rational_tests.c.src index 13e33d0a5..cbb6d9d17 100644 --- a/numpy/core/src/umath/_rational_tests.c.src +++ b/numpy/core/src/umath/_rational_tests.c.src @@ -526,11 +526,11 @@ static PyObject* pyrational_repr(PyObject* self) { rational x = ((PyRational*)self)->r; if (d(x)!=1) { - return PyUString_FromFormat( + return PyUnicode_FromFormat( "rational(%ld,%ld)",(long)x.n,(long)d(x)); } else { - return PyUString_FromFormat( + return PyUnicode_FromFormat( "rational(%ld)",(long)x.n); } } @@ -539,11 +539,11 @@ static PyObject* pyrational_str(PyObject* self) { rational x = ((PyRational*)self)->r; if (d(x)!=1) { - return PyUString_FromFormat( + return PyUnicode_FromFormat( "%ld/%ld",(long)x.n,(long)d(x)); } else { - return PyUString_FromFormat( + return PyUnicode_FromFormat( "%ld",(long)x.n); } } @@ -1126,7 +1126,7 @@ PyMODINIT_FUNC PyInit__rational_tests(void) { if (PyErr_Occurred()) { goto fail; } - numpy_str = PyUString_FromString("numpy"); + numpy_str = PyUnicode_FromString("numpy"); if (!numpy_str) { goto fail; } diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index d08aabd64..932c3b5ab 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -671,7 +671,7 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { d = PyModule_GetDict(m); - version = PyString_FromString("0.1"); + version = PyUnicode_FromString("0.1"); PyDict_SetItemString(d, "__version__", version); Py_DECREF(version); diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c index 4a953410a..cd81f7734 100644 --- a/numpy/core/src/umath/extobj.c +++ b/numpy/core/src/umath/extobj.c @@ -109,7 +109,7 @@ _error_handler(int method, PyObject *errobj, char *errtype, int retstatus, int * errtype, name); goto fail; } - args = Py_BuildValue("NN", PyUString_FromString(errtype), + args = Py_BuildValue("NN", PyUnicode_FromString(errtype), PyLong_FromLong((long) retstatus)); if (args == NULL) { goto fail; diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index bf6e5a698..a0090e302 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -605,7 +605,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, goto fail; } - method_name = PyUString_FromString(method); + method_name = PyUnicode_FromString(method); if (method_name == NULL) { goto fail; } diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 005556fb6..f693eb5c2 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -3318,7 +3318,6 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype, void **out_innerloopdata) { int i; - PyUFunc_Loop1d *funcdata; NPY_UF_DBG_PRINT1("Getting binary op function for type number %d\n", *otype); @@ -3336,7 +3335,10 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype, return -1; } else if (obj != NULL) { - funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj); + PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL); + if (funcdata == NULL) { + return -1; + } while (funcdata != NULL) { int *types = funcdata->arg_types; @@ -5190,9 +5192,12 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc, result = -1; } else { - PyUFunc_Loop1d *current; int cmp = 1; - current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj); + PyUFunc_Loop1d *current = PyCapsule_GetPointer(cobj, NULL); + if (current == NULL) { + result = -1; + goto done; + } while (current != NULL) { cmp = cmp_arg_types(current->arg_types, arg_typenums, ufunc->nargs); @@ -5226,6 +5231,7 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc, } } +done: PyArray_free(arg_typenums); Py_DECREF(key); @@ -5294,7 +5300,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, } /* If it's not there, then make one and return. */ else if (cobj == NULL) { - cobj = NpyCapsule_FromVoidPtr((void *)funcdata, _loop1d_list_free); + cobj = PyCapsule_New((void *)funcdata, NULL, _loop1d_list_free); if (cobj == NULL) { goto fail; } @@ -5312,7 +5318,10 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, * is exactly like this one, then just replace. * Otherwise insert. */ - current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj); + current = PyCapsule_GetPointer(cobj, NULL); + if (current == NULL) { + goto fail; + } while (current != NULL) { cmp = cmp_arg_types(current->arg_types, newtypes, ufunc->nargs); if (cmp >= 0) { @@ -5383,7 +5392,7 @@ ufunc_dealloc(PyUFuncObject *ufunc) static PyObject * ufunc_repr(PyUFuncObject *ufunc) { - return PyUString_FromFormat("<ufunc '%s'>", ufunc->name); + return PyUnicode_FromFormat("<ufunc '%s'>", ufunc->name); } static int @@ -5995,7 +6004,7 @@ ufunc_get_doc(PyUFuncObject *ufunc) } if (ufunc->doc != NULL) { PyUString_ConcatAndDel(&doc, - PyUString_FromFormat("\n\n%s", ufunc->doc)); + PyUnicode_FromFormat("\n\n%s", ufunc->doc)); } return doc; } @@ -6051,7 +6060,7 @@ ufunc_get_types(PyUFuncObject *ufunc) t[ni + 2 + j] = _typecharfromnum(ufunc->types[n]); n++; } - str = PyUString_FromStringAndSize(t, no + ni + 2); + str = PyUnicode_FromStringAndSize(t, no + ni + 2); PyList_SET_ITEM(list, k, str); } PyArray_free(t); @@ -6061,7 +6070,7 @@ ufunc_get_types(PyUFuncObject *ufunc) static PyObject * ufunc_get_name(PyUFuncObject *ufunc) { - return PyUString_FromString(ufunc->name); + return PyUnicode_FromString(ufunc->name); } static PyObject * @@ -6077,7 +6086,7 @@ ufunc_get_signature(PyUFuncObject *ufunc) if (!ufunc->core_enabled) { Py_RETURN_NONE; } - return PyUString_FromString(ufunc->core_signature); + return PyUnicode_FromString(ufunc->core_signature); } #undef _typecharfromnum diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index fec3caef2..aa6f34d59 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -36,15 +36,15 @@ npy_casting_to_py_object(NPY_CASTING casting) { switch (casting) { case NPY_NO_CASTING: - return PyUString_FromString("no"); + return PyUnicode_FromString("no"); case NPY_EQUIV_CASTING: - return PyUString_FromString("equiv"); + return PyUnicode_FromString("equiv"); case NPY_SAFE_CASTING: - return PyUString_FromString("safe"); + return PyUnicode_FromString("safe"); case NPY_SAME_KIND_CASTING: - return PyUString_FromString("same_kind"); + return PyUnicode_FromString("same_kind"); case NPY_UNSAFE_CASTING: - return PyUString_FromString("unsafe"); + return PyUnicode_FromString("unsafe"); default: return PyLong_FromLong(casting); } @@ -1336,7 +1336,6 @@ find_userloop(PyUFuncObject *ufunc, void **out_innerloopdata) { npy_intp i, nin = ufunc->nin, j, nargs = nin + ufunc->nout; - PyUFunc_Loop1d *funcdata; /* Use this to try to avoid repeating the same userdef loop search */ int last_userdef = -1; @@ -1368,9 +1367,11 @@ find_userloop(PyUFuncObject *ufunc, else if (obj == NULL) { continue; } - for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj); - funcdata != NULL; - funcdata = funcdata->next) { + PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL); + if (funcdata == NULL) { + return -1; + } + for (; funcdata != NULL; funcdata = funcdata->next) { int *types = funcdata->arg_types; for (j = 0; j < nargs; ++j) { @@ -1744,7 +1745,6 @@ linear_search_userloop_type_resolver(PyUFuncObject *self, char *out_err_dst_typecode) { npy_intp i, nop = self->nin + self->nout; - PyUFunc_Loop1d *funcdata; /* Use this to try to avoid repeating the same userdef loop search */ int last_userdef = -1; @@ -1776,9 +1776,11 @@ linear_search_userloop_type_resolver(PyUFuncObject *self, else if (obj == NULL) { continue; } - for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj); - funcdata != NULL; - funcdata = funcdata->next) { + PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL); + if (funcdata == NULL) { + return -1; + } + for (; funcdata != NULL; funcdata = funcdata->next) { int *types = funcdata->arg_types; switch (ufunc_loop_matches(self, op, input_casting, output_casting, @@ -1816,7 +1818,6 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self, PyArray_Descr **out_dtype) { int i, j, nin = self->nin, nop = nin + self->nout; - PyUFunc_Loop1d *funcdata; /* Use this to try to avoid repeating the same userdef loop search */ int last_userdef = -1; @@ -1844,9 +1845,11 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self, continue; } - for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj); - funcdata != NULL; - funcdata = funcdata->next) { + PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL); + if (funcdata == NULL) { + return -1; + } + for (; funcdata != NULL; funcdata = funcdata->next) { int *types = funcdata->arg_types; int matched = 1; diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 708a27ad0..ba7ac1706 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -237,23 +237,23 @@ NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL; static int intern_strings(void) { - if (!(npy_um_str_out = PyUString_InternFromString("out"))) return -1; - if (!(npy_um_str_where = PyUString_InternFromString("where"))) return -1; - if (!(npy_um_str_axes = PyUString_InternFromString("axes"))) return -1; - if (!(npy_um_str_axis = PyUString_InternFromString("axis"))) return -1; - if (!(npy_um_str_keepdims = PyUString_InternFromString("keepdims"))) return -1; - if (!(npy_um_str_casting = PyUString_InternFromString("casting"))) return -1; - if (!(npy_um_str_order = PyUString_InternFromString("order"))) return -1; - if (!(npy_um_str_dtype = PyUString_InternFromString("dtype"))) return -1; - if (!(npy_um_str_subok = PyUString_InternFromString("subok"))) return -1; - if (!(npy_um_str_signature = PyUString_InternFromString("signature"))) return -1; - if (!(npy_um_str_sig = PyUString_InternFromString("sig"))) return -1; - if (!(npy_um_str_extobj = PyUString_InternFromString("extobj"))) return -1; - if (!(npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__"))) return -1; - if (!(npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__"))) return -1; - if (!(npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__"))) return -1; - if (!(npy_um_str_ufunc = PyUString_InternFromString("__array_ufunc__"))) return -1; - if (!(npy_um_str_pyvals_name = PyUString_InternFromString(UFUNC_PYVALS_NAME))) return -1; + if (!(npy_um_str_out = PyUnicode_InternFromString("out"))) return -1; + if (!(npy_um_str_where = PyUnicode_InternFromString("where"))) return -1; + if (!(npy_um_str_axes = PyUnicode_InternFromString("axes"))) return -1; + if (!(npy_um_str_axis = PyUnicode_InternFromString("axis"))) return -1; + if (!(npy_um_str_keepdims = PyUnicode_InternFromString("keepdims"))) return -1; + if (!(npy_um_str_casting = PyUnicode_InternFromString("casting"))) return -1; + if (!(npy_um_str_order = PyUnicode_InternFromString("order"))) return -1; + if (!(npy_um_str_dtype = PyUnicode_InternFromString("dtype"))) return -1; + if (!(npy_um_str_subok = PyUnicode_InternFromString("subok"))) return -1; + if (!(npy_um_str_signature = PyUnicode_InternFromString("signature"))) return -1; + if (!(npy_um_str_sig = PyUnicode_InternFromString("sig"))) return -1; + if (!(npy_um_str_extobj = PyUnicode_InternFromString("extobj"))) return -1; + if (!(npy_um_str_array_prepare = PyUnicode_InternFromString("__array_prepare__"))) return -1; + if (!(npy_um_str_array_wrap = PyUnicode_InternFromString("__array_wrap__"))) return -1; + if (!(npy_um_str_array_finalize = PyUnicode_InternFromString("__array_finalize__"))) return -1; + if (!(npy_um_str_ufunc = PyUnicode_InternFromString("__array_ufunc__"))) return -1; + if (!(npy_um_str_pyvals_name = PyUnicode_InternFromString(UFUNC_PYVALS_NAME))) return -1; return 0; } diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index 51cf7039f..a97198076 100644 --- a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -14,7 +14,7 @@ from numpy.testing import ( assert_raises_regex, assert_warns, suppress_warnings, _assert_valid_refcount, HAS_REFCOUNT, ) -from numpy.testing._private.utils import _no_tracing +from numpy.testing._private.utils import _no_tracing, requires_memory from numpy.compat import asbytes, asunicode, pickle try: @@ -2488,3 +2488,29 @@ class TestRegression: assert arr.size * arr.itemsize > 2 ** 31 c_arr = np.ctypeslib.as_ctypes(arr) assert_equal(c_arr._length_, arr.size) + + def test_complex_conversion_error(self): + # gh-17068 + with pytest.raises(TypeError, match=r"Unable to convert dtype.*"): + complex(np.array("now", np.datetime64)) + + def test__array_interface__descr(self): + # gh-17068 + dt = np.dtype(dict(names=['a', 'b'], + offsets=[0, 0], + formats=[np.int64, np.int64])) + descr = np.array((1, 1), dtype=dt).__array_interface__['descr'] + assert descr == [('', '|V8')] # instead of [(b'', '|V8')] + + @pytest.mark.skipif(sys.maxsize < 2 ** 31 + 1, reason='overflows 32-bit python') + @requires_memory(free_bytes=9e9) + def test_dot_big_stride(self): + # gh-17111 + # blas stride = stride//itemsize > int32 max + int32_max = np.iinfo(np.int32).max + n = int32_max + 3 + a = np.empty([n], dtype=np.float32) + b = a[::n-1] + b[...] = 1 + assert b.strides[0] > int32_max * b.dtype.itemsize + assert np.dot(b, b) == 2.0 |
