 numpy/core/code_generators/numpy_api.py      |   19
 numpy/core/src/multiarray/einsum.c.src       | 1253
 numpy/core/src/multiarray/new_iterator.c.src |  122
 numpy/core/tests/test_numeric.py             |  119
4 files changed, 1408 insertions, 105 deletions
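
The bulk of this patch adds specialized einsum inner loops that are selected from the fixed inner-loop strides reported by the new NpyIter_GetInnerFixedStrideArray API. As a rough illustration of the cases being specialized (a sketch, not code from the patch), the Python calls below exercise the stride patterns the new loops target — fully contiguous operands, stride-0 (broadcast scalar) operands, and a stride-0 output — mirroring the tests added at the end of the diff:

```python
import numpy as np

# float32 is the only dtype given an SSE1 path in this patch; 64 elements
# keeps the 16x-unrolled loops on their fast path.
a = np.arange(64, dtype='f4')
b = np.arange(64, dtype='f4')

# contig, contig, contig out: elementwise multiply-accumulate loop
assert np.array_equal(np.einsum('i,i->i', a, b), a * b)
# contig, contig, stride-0 out: reduces to a dot product
assert np.allclose(np.einsum('i,i', a, b), np.dot(a, b))
# stride-0 scalar operand against a contiguous array and output
assert np.array_equal(np.einsum(',i->i', 2, a), 2 * a)
# contiguous operand, stride-0 operand, stride-0 output: a scaled sum
assert np.allclose(np.einsum('i,->', a, 2), 2 * np.sum(a))
```
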
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index b943f17de..4620766cc 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -292,16 +292,17 @@ multiarray_funcs_api = { 'NpyIter_GetWriteFlags': 258, 'NpyIter_DebugPrint': 259, 'NpyIter_IterationNeedsAPI': 260, + 'NpyIter_GetInnerFixedStrideArray': 261, # - 'PyArray_CastingConverter': 261, - 'PyArray_CountNonzero': 262, - 'PyArray_PromoteTypes': 263, - 'PyArray_MinScalarType': 264, - 'PyArray_ResultType': 265, - 'PyArray_CanCastArrayTo': 266, - 'PyArray_CanCastTypeTo': 267, - 'PyArray_EinsteinSum': 268, - 'PyArray_FillWithZero': 269, + 'PyArray_CastingConverter': 262, + 'PyArray_CountNonzero': 263, + 'PyArray_PromoteTypes': 264, + 'PyArray_MinScalarType': 265, + 'PyArray_ResultType': 266, + 'PyArray_CanCastArrayTo': 267, + 'PyArray_CanCastTypeTo': 268, + 'PyArray_EinsteinSum': 269, + 'PyArray_FillWithZero': 270, } ufunc_types_api = { diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 4ebd3aa82..d25c6af90 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -18,6 +18,32 @@ #include <ctype.h> +#ifdef __SSE__ +#define EINSUM_USE_SSE1 1 +#else +#define EINSUM_USE_SSE1 0 +#endif + +/* + * TODO: Only SSE for float32 is implemented in the loops, + * no SSE2 for float64 + */ +#ifdef __SSE2__ +#define EINSUM_USE_SSE2 0 +#else +#define EINSUM_USE_SSE2 0 +#endif + +#if EINSUM_USE_SSE1 +#include <xmmintrin.h> +#endif + +#if EINSUM_USE_SSE2 +#include <emmintrin.h> +#endif + +#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0) + typedef enum { BROADCAST_LEFT, BROADCAST_RIGHT, @@ -45,6 +71,10 @@ typedef enum { * 0*5, * 0*4, * 1*3# + * #float32 = 0*5, + * 0*5, + * 0,1,0,0, + * 0*3# */ /**begin repeat1 @@ -55,29 +85,46 @@ static void @name@_sum_of_products_@noplabel@(int nop, char **dataptr, npy_intp *strides, npy_intp count) { +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + while (count--) { #if !@complex@ # if @nop@ == 1 - *(npy_@name@ *)dataptr[1] = @to@(@from@(*(npy_@name@ *)dataptr[0]) + - @from@(*(npy_@name@ *)dataptr[1])); - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; + *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) + + @from@(*(npy_@name@ *)data_out)); + data0 += stride0; + data_out += stride_out; # elif @nop@ == 2 - *(npy_@name@ *)dataptr[2] = @to@(@from@(*(npy_@name@ *)dataptr[0]) * - @from@(*(npy_@name@ *)dataptr[1]) + - @from@(*(npy_@name@ *)dataptr[2])); - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; - dataptr[2] += strides[2]; + *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) * + @from@(*(npy_@name@ *)data1) + + @from@(*(npy_@name@ *)data_out)); + data0 += stride0; + data1 += stride1; + data_out += stride_out; # elif @nop@ == 3 - *(npy_@name@ *)dataptr[3] = @to@(@from@(*(npy_@name@ *)dataptr[0]) * - @from@(*(npy_@name@ *)dataptr[1]) * - @from@(*(npy_@name@ *)dataptr[2]) + - @from@(*(npy_@name@ *)dataptr[3])); - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; - dataptr[2] += strides[2]; - 
dataptr[3] += strides[3]; + *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) * + @from@(*(npy_@name@ *)data1) * + @from@(*(npy_@name@ *)data2) + + @from@(*(npy_@name@ *)data_out)); + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; # else npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]); int i; @@ -92,12 +139,12 @@ static void # endif #else /* complex */ # if @nop@ == 1 - ((npy_@temp@ *)dataptr[1])[0] = ((npy_@temp@ *)dataptr[0])[0] + - ((npy_@temp@ *)dataptr[1])[0]; - ((npy_@temp@ *)dataptr[1])[1] = ((npy_@temp@ *)dataptr[0])[1] + - ((npy_@temp@ *)dataptr[1])[1]; - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; + ((npy_@temp@ *)data_out)[0] = ((npy_@temp@ *)data0)[0] + + ((npy_@temp@ *)data_out)[0]; + ((npy_@temp@ *)data_out)[1] = ((npy_@temp@ *)data0)[1] + + ((npy_@temp@ *)data_out)[1]; + data0 += stride0; + data_out += stride_out; # else # if @nop@ <= 3 #define _SUMPROD_NOP @nop@ @@ -108,7 +155,7 @@ static void re = ((npy_@temp@ *)dataptr[0])[0]; im = ((npy_@temp@ *)dataptr[0])[1]; int i; - for (i = 1; i <= _SUMPROD_NOP; ++i) { + for (i = 1; i < _SUMPROD_NOP; ++i) { tmp = re * ((npy_@temp@ *)dataptr[i])[0] - im * ((npy_@temp@ *)dataptr[i])[1]; im = re * ((npy_@temp@ *)dataptr[i])[1] + @@ -129,6 +176,723 @@ static void } } +#if @nop@ == 1 + +static void +@name@_sum_of_products_contig_one(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@name@ *data_out = (npy_@name@ *)dataptr[1]; + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else /* complex */ + ((npy_@temp@ *)data_out + 2*@i@)[0] = + ((npy_@temp@ *)data0 + 2*@i@)[0] + + ((npy_@temp@ *)data_out + 2*@i@)[0]; + ((npy_@temp@ *)data_out + 2*@i@)[1] = + ((npy_@temp@ *)data0 + 2*@i@)[1] + + ((npy_@temp@ *)data_out + 2*@i@)[1]; +#endif + data0 += 16; + data_out += 16; +/**end repeat2**/ + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } +#if !@complex@ + data_out[@i@] = @to@(@from@(data0[@i@]) + + @from@(data_out[@i@])); +#else + ((npy_@temp@ *)data_out + 2*@i@)[0] = + ((npy_@temp@ *)data0 + 2*@i@)[0] + + ((npy_@temp@ *)data_out + 2*@i@)[0]; + ((npy_@temp@ *)data_out + 2*@i@)[1] = + ((npy_@temp@ *)data0 + 2*@i@)[1] + + ((npy_@temp@ *)data_out + 2*@i@)[1]; +#endif +/**end repeat2**/ +} + +#elif @nop@ == 2 && !@complex@ + +static void +@name@_sum_of_products_contig_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@name@ *data1 = (npy_@name@ *)dataptr[1]; + npy_@name@ *data_out = (npy_@name@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b; +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && + EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data0 += 16; + data1 += 16; + data_out += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while 
(count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data0 += 16; + data1 += 16; + data_out += 16; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +/* Some extra specializations for the two operand case */ +static void +@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@temp@ value0 = @from@(*(npy_@name@ *)dataptr[0]); + npy_@name@ *data1 = (npy_@name@ *)dataptr[1]; + npy_@name@ *data_out = (npy_@name@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b, value0_sse; + + value0_sse = _mm_set_ps1(value0); +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data1 += 16; + data_out += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@)); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + data_out[@i@] = @to@(value0 * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data1 += 16; + data_out += 16; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(value0 * + @from@(data1[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +static void +@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@temp@ value1 = @from@(*(npy_@name@ *)dataptr[1]); + npy_@name@ *data_out = (npy_@name@ *)dataptr[2]; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, b, value1_sse; + + value1_sse = _mm_set_ps1(value1); +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse); + b = _mm_add_ps(a, _mm_load_ps(data_out+@i@)); + _mm_store_ps(data_out+@i@, b); +/**end repeat2**/ + data0 += 16; + data_out += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ 
+/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse); + b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); + _mm_storeu_ps(data_out+@i@, b); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + data_out[@i@] = @to@(@from@(data0[@i@])* + value1 + + @from@(data_out[@i@])); +/**end repeat2**/ +#endif + data0 += 16; + data_out += 16; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@])* + value1 + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +static void +@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@name@ *data1 = (npy_@name@ *)dataptr[1]; + npy_@temp@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); + accum_sse = _mm_add_ps(accum_sse, a); +/**end repeat2**/ + data0 += 16; + data1 += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); + accum_sse = _mm_add_ps(accum_sse, a); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + accum += @from@(data0[@i@]) * @from@(data1[@i@]); +/**end repeat2**/ +#endif + data0 += 16; + data1 += 16; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#endif + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + *(npy_@name@ *)dataptr[2] += @to@(accum); + return; + } + accum += @from@(data0[@i@]) * @from@(data1[@i@]); +/**end repeat2**/ + + *(npy_@name@ *)dataptr[2] += @to@(accum); +} + +static void +@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@temp@ value0 = @from@(*(npy_@name@ *)dataptr[0]); + npy_@name@ *data1 = (npy_@name@ *)dataptr[1]; + npy_@temp@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. 
+ */ + accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); +/**end repeat2**/ + data1 += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + accum += @from@(data1[@i@]); +/**end repeat2**/ +#endif + data1 += 16; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#endif + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + *(npy_@name@ *)dataptr[2] += @to@(value0 * accum); + return; + } + accum += @from@(data1[@i@]); +/**end repeat2**/ + + *(npy_@name@ *)dataptr[2] += @to@(value0 * accum); +} + +static void +@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@temp@ value1 = @from@(*(npy_@name@ *)dataptr[1]); + npy_@temp@ accum = 0; + +#if EINSUM_USE_SSE1 && @float32@ + __m128 a, accum_sse = _mm_setzero_ps(); +#endif + +#if EINSUM_USE_SSE1 && @float32@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); +/**end repeat2**/ + data0 += 16; + } + } +#endif + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +#if EINSUM_USE_SSE1 && @float32@ +/**begin repeat2 + * #i = 0, 4, 8, 12# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. 
+ */ + accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); +/**end repeat2**/ +#else +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + accum += @from@(data0[@i@]); +/**end repeat2**/ +#endif + data0 += 16; + } + +#if EINSUM_USE_SSE1 && @float32@ + /* Add the four SSE values and put in accum */ + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); + accum_sse = _mm_add_ps(a, accum_sse); + a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); + accum_sse = _mm_add_ps(a, accum_sse); + _mm_store_ss(&accum, accum_sse); +#endif + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + *(npy_@name@ *)dataptr[2] += @to@(accum * value1); + return; + } + accum += @from@(data0[@i@]); +/**end repeat2**/ + + *(npy_@name@ *)dataptr[2] += @to@(accum * value1); +} + +#elif @nop@ == 3 && !@complex@ + +static void +@name@_sum_of_products_contig_three(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + npy_@name@ *data0 = (npy_@name@ *)dataptr[0]; + npy_@name@ *data1 = (npy_@name@ *)dataptr[1]; + npy_@name@ *data2 = (npy_@name@ *)dataptr[2]; + npy_@name@ *data_out = (npy_@name@ *)dataptr[3]; + + /* Unroll the loop by 16 */ + while (count >= 16) { + count -= 16; + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ + data0 += 16; + data1 += 16; + data_out += 16; + } + + /* Finish off the loop */ + +/**begin repeat2 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + data_out[@i@] = @to@(@from@(data0[@i@]) * + @from@(data1[@i@]) * + @from@(data2[@i@]) + + @from@(data_out[@i@])); +/**end repeat2**/ +} + +#else + +static void +@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp *NPY_UNUSED(strides), npy_intp count) +{ + while (count--) { +#if !@complex@ + npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(npy_@name@ *)dataptr[i]); + } + *(npy_@name@ *)dataptr[nop] = @to@(temp + + @from@(*(npy_@name@ *)dataptr[i])); + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(npy_@name@); + } +#else /* complex */ +# if @nop@ <= 3 +# define _SUMPROD_NOP @nop@ +# else +# define _SUMPROD_NOP nop +# endif + npy_@temp@ re, im, tmp; + re = ((npy_@temp@ *)dataptr[0])[0]; + im = ((npy_@temp@ *)dataptr[0])[1]; + int i; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((npy_@temp@ *)dataptr[i])[0] - + im * ((npy_@temp@ *)dataptr[i])[1]; + im = re * ((npy_@temp@ *)dataptr[i])[1] + + im * ((npy_@temp@ *)dataptr[i])[0]; + re = tmp; + } + ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0] = re + + ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0]; + ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1] = im + + ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1]; + + for (i = 0; i <= _SUMPROD_NOP; ++i) { + dataptr[i] += sizeof(npy_@name@); + } +# undef _SUMPROD_NOP +#endif + } +} + +#endif + +static void +@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp *strides, npy_intp count) +{ +#if @complex@ + npy_@temp@ accum_re = 0, accum_im = 0; +#else + npy_@temp@ accum = 0; +#endif + +#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) && !@complex@ + char *data1 = dataptr[1]; + 
npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) && !@complex@ + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + while (count--) { +#if !@complex@ +# if @nop@ == 1 + accum += @from@(*(npy_@name@ *)data0); + data0 += stride0; +# elif @nop@ == 2 + accum += @from@(*(npy_@name@ *)data0) * + @from@(*(npy_@name@ *)data1); + data0 += stride0; + data1 += stride1; +# elif @nop@ == 3 + accum += @from@(*(npy_@name@ *)data0) * + @from@(*(npy_@name@ *)data1) * + @from@(*(npy_@name@ *)data2); + data0 += stride0; + data1 += stride1; + data2 += stride2; +# else + npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]); + int i; + for (i = 1; i < nop; ++i) { + temp *= @from@(*(npy_@name@ *)dataptr[i]); + } + accum += temp; + for (i = 0; i < nop; ++i) { + dataptr[i] += strides[i]; + } +# endif +#else /* complex */ +# if @nop@ == 1 + accum_re += ((npy_@temp@ *)data0)[0]; + accum_im += ((npy_@temp@ *)data0)[1]; + data0 += stride0; +# else +# if @nop@ <= 3 +#define _SUMPROD_NOP @nop@ +# else +#define _SUMPROD_NOP nop +# endif + npy_@temp@ re, im, tmp; + re = ((npy_@temp@ *)dataptr[0])[0]; + im = ((npy_@temp@ *)dataptr[0])[1]; + int i; + for (i = 1; i < _SUMPROD_NOP; ++i) { + tmp = re * ((npy_@temp@ *)dataptr[i])[0] - + im * ((npy_@temp@ *)dataptr[i])[1]; + im = re * ((npy_@temp@ *)dataptr[i])[1] + + im * ((npy_@temp@ *)dataptr[i])[0]; + re = tmp; + } + accum_re += re; + accum_im += im; + for (i = 0; i < _SUMPROD_NOP; ++i) { + dataptr[i] += strides[i]; + } +#undef _SUMPROD_NOP +# endif +#endif + } + +#if @complex@ +# if @nop@ <= 3 + ((npy_@temp@ *)dataptr[@nop@])[0] += accum_re; + ((npy_@temp@ *)dataptr[@nop@])[1] += accum_im; +# else + ((npy_@temp@ *)dataptr[nop])[0] += accum_re; + ((npy_@temp@ *)dataptr[nop])[1] += accum_im; +# endif +#else +# if @nop@ <= 3 + *((npy_@name@ *)dataptr[@nop@]) = @to@(accum + + @from@(*((npy_@name@ *)dataptr[@nop@]))); +# else + *((npy_@name@ *)dataptr[nop]) = @to@(accum + + @from@(*((npy_@name@ *)dataptr[nop]))); +# endif +#endif + +} + /**end repeat1**/ /**end repeat**/ @@ -145,28 +909,45 @@ static void bool_sum_of_products_@noplabel@(int nop, char **dataptr, npy_intp *strides, npy_intp count) { +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; + npy_intp stride_out = strides[@nop@]; +#endif + while (count--) { #if @nop@ == 1 - *(npy_bool *)dataptr[1] = *(npy_bool *)dataptr[0] || - *(npy_bool *)dataptr[1]; - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; + *(npy_bool *)data_out = *(npy_bool *)data0 || + *(npy_bool *)data_out; + data0 += stride0; + data_out += stride_out; #elif @nop@ == 2 - *(npy_bool *)dataptr[2] = (*(npy_bool *)dataptr[0] && - *(npy_bool *)dataptr[1]) || - *(npy_bool *)dataptr[2]; - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; - dataptr[2] += strides[2]; + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool *)data1) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data_out += stride_out; #elif @nop@ == 3 - *(npy_bool *)dataptr[3] = (*(npy_bool *)dataptr[0] && - *(npy_bool *)dataptr[1] && - *(npy_bool *)dataptr[2]) || - *(npy_bool *)dataptr[3]; - dataptr[0] += strides[0]; - dataptr[1] += strides[1]; - dataptr[2] += strides[2]; - dataptr[3] += strides[3]; + *(npy_bool *)data_out = (*(npy_bool *)data0 && + *(npy_bool 
*)data1 && + *(npy_bool *)data2) || + *(npy_bool *)data_out; + data0 += stride0; + data1 += stride1; + data2 += stride2; + data_out += stride_out; #else npy_bool temp = *(npy_bool *)dataptr[0]; int i; @@ -181,13 +962,377 @@ bool_sum_of_products_@noplabel@(int nop, char **dataptr, } } +static void +bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr, + npy_intp *strides, npy_intp count) +{ +#if (@nop@ <= 3) + char *data0 = dataptr[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; +#endif +#if (@nop@ <= 3) + char *data_out = dataptr[@nop@]; +#endif + +/* Unroll the loop by 16 for fixed-size nop */ +#if (@nop@ <= 3) + while (count >= 16) { + count -= 16; +#else + while (count--) { +#endif + +# if @nop@ == 1 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# elif @nop@ == 2 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data1 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# elif @nop@ == 3 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@)) && + (*((npy_bool *)data2 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data1 += 16*sizeof(npy_bool); + data2 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i]; + for (i = 0; i <= nop; ++i) { + dataptr[i] += sizeof(npy_bool); + } +# endif + } + + /* If the loop was unrolled, we need to finish it off */ +#if (@nop@ <= 3) +# if @nop@ == 1 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# elif @nop@ == 2 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data1 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# elif @nop@ == 3 +/**begin repeat1 + * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + if (count-- == 0) { + return; + } + *((npy_bool *)data_out + @i@) = + ((*((npy_bool *)data0 + @i@)) && + (*((npy_bool *)data1 + @i@)) && + (*((npy_bool *)data2 + @i@))) || + (*((npy_bool *)data_out + @i@)); +/**end repeat1**/ + data0 += 16*sizeof(npy_bool); + data1 += 16*sizeof(npy_bool); + data2 += 16*sizeof(npy_bool); + data_out += 16*sizeof(npy_bool); +# endif +#endif +} + +static void +bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, + npy_intp *strides, npy_intp count) 
+{ + npy_bool accum = 0; + +#if (@nop@ <= 3) + char *data0 = dataptr[0]; + npy_intp stride0 = strides[0]; +#endif +#if (@nop@ == 2 || @nop@ == 3) + char *data1 = dataptr[1]; + npy_intp stride1 = strides[1]; +#endif +#if (@nop@ == 3) + char *data2 = dataptr[2]; + npy_intp stride2 = strides[2]; +#endif + + while (count--) { +#if @nop@ == 1 + accum = *(npy_bool *)data0 || accum; + data0 += stride0; +#elif @nop@ == 2 + accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum; + data0 += stride0; + data1 += stride1; +#elif @nop@ == 3 + accum = (*(npy_bool *)data0 && + *(npy_bool *)data1 && + *(npy_bool *)data2) || accum; + data0 += stride0; + data1 += stride1; + data2 += stride2; +#else + npy_bool temp = *(npy_bool *)dataptr[0]; + int i; + for (i = 1; i < nop; ++i) { + temp = temp && *(npy_bool *)dataptr[i]; + } + accum = temp || accum; + for (i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } +#endif + } + +# if @nop@ <= 3 + *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]); +# else + *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]); +# endif +} + /**end repeat**/ typedef void (*sum_of_products_fn)(int, char **, npy_intp *, npy_intp); static sum_of_products_fn -get_sum_of_products_function(int nop, int type_num) +get_sum_of_products_function(int nop, int type_num, + npy_intp itemsize, npy_intp *fixed_strides) { + int iop; + + /* nop of 2 has more specializations */ + if (nop == 2) { + if (fixed_strides[0] == itemsize) { + if (fixed_strides[1] == itemsize) { + if (fixed_strides[2] == itemsize) { + /* contig, contig, contig */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_contig_two; +/**end repeat**/ + } + } + else if (fixed_strides[2] == 0) { + /* contig, contig, stride0 */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_contig_contig_outstride0_two; +/**end repeat**/ + } + } + } + else if (fixed_strides[1] == 0) { + if (fixed_strides[2] == itemsize) { + /* contig, stride0, contig */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_contig_stride0_outcontig_two; +/**end repeat**/ + } + } + else if (fixed_strides[2] == 0) { + /* contig, stride0, stride0 */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_contig_stride0_outstride0_two; +/**end repeat**/ + } + } + } + } + else if (fixed_strides[0] == 0) { + if (fixed_strides[1] == itemsize) { + if (fixed_strides[2] == 
itemsize) { + /* stride0, contig, contig */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_stride0_contig_outcontig_two; +/**end repeat**/ + } + } + else if (fixed_strides[2] == 0) { + /* stride0, contig, stride0 */ + switch (type_num) { +/**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE# + */ + case NPY_@NAME@: + return &@name@_sum_of_products_stride0_contig_outstride0_two; +/**end repeat**/ + } + } + } + } + } + + /* Inner loop with an output stride of 0 */ + if (fixed_strides[nop] == 0) { + switch (type_num) { +/**begin repeat + * #name = bool, + * byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #NAME = BOOL, + * BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + */ + case NPY_@NAME@: + switch (nop) { +/**begin repeat1 + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ +#if @nop@ <= 3 + case @nop@: +#else + default: +#endif + return &@name@_sum_of_products_outstride0_@noplabel@; +/**end repeat1**/ + } +/**end repeat**/ + } + } + + /* Check for all contiguous */ + for (iop = 0; iop < nop; ++iop) { + if (fixed_strides[iop] != itemsize) { + break; + } + } + + /* Contiguous loop */ + if (iop == nop) { + switch (type_num) { +/**begin repeat + * #name = bool, + * byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong, + * half, float, double, longdouble, + * cfloat, cdouble, clongdouble# + * #NAME = BOOL, + * BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + */ + case NPY_@NAME@: + switch (nop) { +/**begin repeat1 + * #nop = 1, 2, 3, 1000# + * #noplabel = one, two, three, any# + */ +#if @nop@ <= 3 + case @nop@: +#else + default: +#endif + return &@name@_sum_of_products_contig_@noplabel@; +/**end repeat1**/ + } +/**end repeat**/ + } + } + + /* Regular inner loop */ switch (type_num) { /**begin repeat * #name = bool, @@ -209,11 +1354,10 @@ get_sum_of_products_function(int nop, int type_num) */ #if @nop@ <= 3 case @nop@: - return &@name@_sum_of_products_@noplabel@; #else default: - return &@name@_sum_of_products_@noplabel@; #endif + return &@name@_sum_of_products_@noplabel@; /**end repeat1**/ } /**end repeat**/ @@ -913,6 +2057,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, NpyIter *iter; sum_of_products_fn sop; + npy_intp fixed_strides[NPY_MAXARGS]; /* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */ if (nop >= NPY_MAXARGS) { @@ -1156,7 +2301,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, /* Allocate the iterator */ iter = NpyIter_MultiNew(nop+1, op, NPY_ITER_NO_INNER_ITERATION| - ((dtype == NULL) ? 0 : NPY_ITER_COMMON_DTYPE)| + ((dtype != NULL) ? 
0 : NPY_ITER_COMMON_DTYPE)| NPY_ITER_BUFFERED| NPY_ITER_DELAY_BUFALLOC| NPY_ITER_GROWINNER| @@ -1176,8 +2321,20 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, PyArray_FillWithZero(ret); NpyIter_Reset(iter, NULL); + NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); sop = get_sum_of_products_function(nop, - NpyIter_GetDescrArray(iter)[0]->type_num); + NpyIter_GetDescrArray(iter)[0]->type_num, + NpyIter_GetDescrArray(iter)[0]->elsize, + fixed_strides); + + #if 0 + NpyIter_DebugPrint(iter); + printf("fixed strides:\n"); + for (iop = 0; iop <= nop; ++iop) { + printf("%ld ", fixed_strides[iop]); + } + printf("\n"); + #endif /* Finally, the main loop */ if (sop == NULL) { diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src index a1464ff92..653a8e829 100644 --- a/numpy/core/src/multiarray/new_iterator.c.src +++ b/numpy/core/src/multiarray/new_iterator.c.src @@ -113,7 +113,7 @@ typedef struct NpyIter_BD NpyIter_BufferData; ((NPY_SIZEOF_INTP)*(niter+1)) #define NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter) \ ((NPY_SIZEOF_INTP)*(niter+1)) -#define NIT_OBJECTS_SIZEOF(itflags, ndim, niter) \ +#define NIT_OPERANDS_SIZEOF(itflags, ndim, niter) \ ((NPY_SIZEOF_INTP)*(niter)) #define NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter) \ (NPY_INTP_ALIGNED(niter)) @@ -132,12 +132,12 @@ typedef struct NpyIter_BD NpyIter_BufferData; #define NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) \ (NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter) + \ NIT_RESETDATAPTR_SIZEOF(itflags, ndim, niter)) -#define NIT_OBJECTS_OFFSET(itflags, ndim, niter) \ +#define NIT_OPERANDS_OFFSET(itflags, ndim, niter) \ (NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) + \ NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter)) #define NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) \ - (NIT_OBJECTS_OFFSET(itflags, ndim, niter) + \ - NIT_OBJECTS_SIZEOF(itflags, ndim, niter)) + (NIT_OPERANDS_OFFSET(itflags, ndim, niter) + \ + NIT_OPERANDS_SIZEOF(itflags, ndim, niter)) #define NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) \ (NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) + \ NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter)) @@ -168,8 +168,8 @@ typedef struct NpyIter_BD NpyIter_BufferData; &(iter)->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter))) #define NIT_BASEOFFSETS(iter) ((npy_intp *)( \ &(iter)->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter))) -#define NIT_OBJECTS(iter) ((PyArrayObject **)( \ - &(iter)->iter_flexdata + NIT_OBJECTS_OFFSET(itflags, ndim, niter))) +#define NIT_OPERANDS(iter) ((PyArrayObject **)( \ + &(iter)->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, niter))) #define NIT_OPITFLAGS(iter) ( \ &(iter)->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, niter)) #define NIT_BUFFERDATA(iter) ((NpyIter_BufferData *)( \ @@ -399,7 +399,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, NIT_ITERINDEX(iter) = 0; memset(NIT_BASEOFFSETS(iter), 0, (niter+1)*NPY_SIZEOF_INTP); - op = NIT_OBJECTS(iter); + op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); op_itflags = NIT_OPITFLAGS(iter); op_dataptr = NIT_RESETDATAPTR(iter); @@ -519,6 +519,9 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, PyArray_Descr *dtype; int only_inputs = !(flags&NPY_ITER_COMMON_DTYPE); + op = NIT_OPERANDS(iter); + op_dtype = NIT_DTYPES(iter); + dtype = npyiter_get_common_dtype(niter, op, op_itflags, op_dtype, op_request_dtypes, @@ -529,21 +532,21 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, return NULL; } if 
(flags&NPY_ITER_COMMON_DTYPE) { + NPY_IT_DBG_PRINTF("Iterator: Replacing all data types\n"); /* Replace all the data types */ for (iiter = 0; iiter < niter; ++iiter) { Py_XDECREF(op_dtype[iiter]); Py_INCREF(dtype); op_dtype[iiter] = dtype; - NIT_DTYPES(iter)[iiter] = dtype; } } else { + NPY_IT_DBG_PRINTF("Iterator: Setting unset output data types\n"); /* Replace the NULL data types */ for (iiter = 0; iiter < niter; ++iiter) { if (op_dtype[iiter] == NULL) { Py_INCREF(dtype); op_dtype[iiter] = dtype; - NIT_DTYPES(iter)[iiter] = dtype; } } } @@ -584,7 +587,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, */ itflags = NIT_ITFLAGS(iter); ndim = NIT_NDIM(iter); - op = NIT_OBJECTS(iter); + op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); op_itflags = NIT_OPITFLAGS(iter); op_dataptr = NIT_RESETDATAPTR(iter); @@ -697,7 +700,7 @@ NpyIter_Copy(NpyIter *iter) memcpy(newiter, iter, size); /* Take ownership of references to the operands and dtypes */ - objects = NIT_OBJECTS(newiter); + objects = NIT_OPERANDS(newiter); dtypes = NIT_DTYPES(newiter); for (iiter = 0; iiter < niter; ++iiter) { Py_INCREF(objects[iiter]); @@ -787,7 +790,7 @@ NpyIter_Deallocate(NpyIter *iter) npy_intp iiter, niter = NIT_NITER(iter); PyArray_Descr **dtype = NIT_DTYPES(iter); - PyArrayObject **object = NIT_OBJECTS(iter); + PyArrayObject **object = NIT_OPERANDS(iter); /* Deallocate any buffers and buffering data */ if (itflags&NPY_ITFLAG_BUFFER) { @@ -2018,7 +2021,7 @@ NpyIter_GetOperandArray(NpyIter *iter) npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); - return NIT_OBJECTS(iter); + return NIT_OPERANDS(iter); } /*NUMPY_API @@ -2052,7 +2055,7 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i) return NULL; } - obj = NIT_OBJECTS(iter)[i]; + obj = NIT_OPERANDS(iter)[i]; dtype = PyArray_DESCR(obj); writeable = NIT_OPITFLAGS(iter)[i]&NPY_OP_ITFLAG_WRITE; dataptr = NIT_RESETDATAPTR(iter)[i]; @@ -2161,6 +2164,65 @@ NpyIter_GetInnerStrideArray(NpyIter *iter) } /*NUMPY_API + * Get an array of strides which are fixed. Any strides which may + * change during iteration receive the value NPY_MAX_INTP. Once + * the iterator is ready to iterate, call this to get the strides + * which will always be fixed in the inner loop, then choose optimized + * inner loop functions which take advantage of those fixed strides. + * + * This function may be safely called without holding the Python GIL. 
+ */ +NPY_NO_EXPORT void +NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) +{ + npy_uint32 itflags = NIT_ITFLAGS(iter); + npy_intp ndim = NIT_NDIM(iter); + npy_intp iiter, niter = NIT_NITER(iter); + + NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); + + if (itflags&NPY_ITFLAG_BUFFER) { + NpyIter_BufferData *data = NIT_BUFFERDATA(iter); + char *op_itflags = NIT_OPITFLAGS(iter); + npy_intp stride, *strides = NBF_STRIDES(data), + *ad_strides = NAD_STRIDES(axisdata); + PyArray_Descr **dtypes = NIT_DTYPES(iter); + + for (iiter = 0; iiter < niter; ++iiter) { + stride = strides[iiter]; + /* Operands which are always/never buffered have fixed strides */ + if (op_itflags[iiter]& + (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_BUFNEVER)) { + out_strides[iiter] = stride; + } + /* Reductions in the inner loop have fixed strides */ + else if (stride == 0 && (itflags&NPY_ITFLAG_REDUCE)) { + out_strides[iiter] = stride; + } + /* + * Inner loop contiguous array means its stride won't change when + * switching between buffering and not buffering + */ + else if (ad_strides[iiter] == dtypes[iiter]->elsize) { + out_strides[iiter] = ad_strides[iiter]; + } + /* + * Otherwise the strides can change if the operand is sometimes + * buffered, sometimes not. + */ + else { + out_strides[iiter] = NPY_MAX_INTP; + } + } + } + else { + /* If there's no buffering, the strides are always fixed */ + memcpy(out_strides, NAD_STRIDES(axisdata), niter*NPY_SIZEOF_INTP); + } +} + + +/*NUMPY_API * Get a pointer to the size of the inner loop (when HasInnerLoop is false) * * This function may be safely called without holding the Python GIL. @@ -2685,6 +2747,20 @@ npyiter_check_casting(npy_intp niter, PyArrayObject **op, npy_intp iiter; for(iiter = 0; iiter < niter; ++iiter) { + NPY_IT_DBG_PRINTF("Iterator: Checking casting for operand %d\n", + (int)iiter); +#if NPY_IT_DBG_TRACING + printf("op: "); + if (op[iiter] != NULL) { + PyObject_Print((PyObject *)PyArray_DESCR(op[iiter]), stdout, 0); + } + else { + printf("<null>"); + } + printf(", iter: "); + PyObject_Print((PyObject *)op_dtype[iiter], stdout, 0); + printf("\n"); +#endif /* If the types aren't equivalent, a cast is necessary */ if (op[iiter] != NULL && !PyArray_EquivTypes(PyArray_DESCR(op[iiter]), op_dtype[iiter])) { @@ -2795,7 +2871,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags, char *odataptr; NpyIter_AxisData *axisdata0, *axisdata; npy_intp sizeof_axisdata; - PyArrayObject **op = NIT_OBJECTS(iter); + PyArrayObject **op = NIT_OPERANDS(iter); axisdata0 = NIT_AXISDATA(iter); sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter); @@ -3331,7 +3407,7 @@ npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order) NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER; /* Only need to actually do something if there is more than 1 dim */ if (ndim > 1) { - PyArrayObject **op = NIT_OBJECTS(iter); + PyArrayObject **op = NIT_OPERANDS(iter); int forder = 1; /* Check that all the array inputs are fortran order */ @@ -3975,7 +4051,7 @@ npyiter_allocate_arrays(NpyIter *iter, npy_intp iiter, niter = NIT_NITER(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); - PyArrayObject **op = NIT_OBJECTS(iter); + PyArrayObject **op = NIT_OPERANDS(iter); for (iiter = 0; iiter < niter; ++iiter) { if (op[iiter] == NULL) { @@ -4216,6 +4292,8 @@ npyiter_get_common_dtype(npy_intp niter, PyArrayObject **op, PyArrayObject *arrs[NPY_MAXARGS]; PyArray_Descr *dtypes[NPY_MAXARGS]; + NPY_IT_DBG_PRINTF("Iterator: Getting a common data type from operands\n"); + 
for (iiter = 0; iiter < niter; ++iiter) { if (op_dtype[iiter] != NULL && (!only_inputs || (op_itflags[iiter]&NPY_OP_ITFLAG_READ))) { @@ -4246,7 +4324,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter) char *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - PyArrayObject **op = NIT_OBJECTS(iter); + PyArrayObject **op = NIT_OPERANDS(iter); PyArray_Descr **op_dtype = NIT_DTYPES(iter); npy_intp *strides = NAD_STRIDES(axisdata), op_stride; PyArray_StridedTransferFn **readtransferfn = NBF_READTRANSFERFN(bufferdata), @@ -4614,7 +4692,7 @@ npyiter_copy_to_buffers(NpyIter *iter) NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); PyArray_Descr **dtypes = NIT_DTYPES(iter); - PyArrayObject **operands = NIT_OBJECTS(iter); + PyArrayObject **operands = NIT_OPERANDS(iter); npy_intp *strides = NBF_STRIDES(bufferdata), *ad_strides = NAD_STRIDES(axisdata); char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata); @@ -4929,14 +5007,14 @@ NpyIter_DebugPrint(NpyIter *iter) } printf("Operands: "); for (iiter = 0; iiter < niter; ++iiter) { - printf("%p ", NIT_OBJECTS(iter)[iiter]); + printf("%p ", NIT_OPERANDS(iter)[iiter]); } printf("\n"); printf("Operand DTypes: "); for (iiter = 0; iiter < niter; ++iiter) { PyArray_Descr *dtype; - if (NIT_OBJECTS(iter)[iiter] != NULL) { - dtype = PyArray_DESCR(NIT_OBJECTS(iter)[iiter]); + if (NIT_OPERANDS(iter)[iiter] != NULL) { + dtype = PyArray_DESCR(NIT_OPERANDS(iter)[iiter]); if (dtype != NULL) PyObject_Print((PyObject *)dtype, stdout, 0); else diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 3fe9ebd23..8bd7d45ae 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -283,72 +283,139 @@ class TestEinSum(TestCase): assert_(b.base is a) assert_equal(b, a.swapaxes(0,1)) - def test_einsum_sums(self): + def check_einsum_sums(self, dtype): # sum(a, axis=-1) - a = np.arange(10) + a = np.arange(10, dtype=dtype) assert_equal(np.einsum("i->", a), np.sum(a, axis=-1)) - a = np.arange(24).reshape(2,3,4) + a = np.arange(24, dtype=dtype).reshape(2,3,4) assert_equal(np.einsum("i->", a), np.sum(a, axis=-1)) # sum(a, axis=0) - a = np.arange(10) + a = np.arange(10, dtype=dtype) assert_equal(np.einsum("i...->", a), np.sum(a, axis=0)) - a = np.arange(24).reshape(2,3,4) + a = np.arange(24, dtype=dtype).reshape(2,3,4) assert_equal(np.einsum("i...->", a), np.sum(a, axis=0)) # trace(a) - a = np.arange(25).reshape(5,5) + a = np.arange(25, dtype=dtype).reshape(5,5) assert_equal(np.einsum("ii", a), np.trace(a)) # multiply(a, b) - a = np.arange(12).reshape(3,4) - b = np.arange(24).reshape(2,3,4) + a = np.arange(12, dtype=dtype).reshape(3,4) + b = np.arange(24, dtype=dtype).reshape(2,3,4) assert_equal(np.einsum(",", a, b), np.multiply(a, b)) # inner(a,b) - a = np.arange(24).reshape(2,3,4) - b = np.arange(4) + a = np.arange(24, dtype=dtype).reshape(2,3,4) + b = np.arange(4, dtype=dtype) assert_equal(np.einsum("i,i", a, b), np.inner(a, b)) - a = np.arange(24).reshape(2,3,4) - b = np.arange(2) + a = np.arange(24, dtype=dtype).reshape(2,3,4) + b = np.arange(2, dtype=dtype) assert_equal(np.einsum("i...,i...", a, b), np.inner(a.T, b.T).T) # outer(a,b) - a = np.arange(3)+1 - b = np.arange(4)+1 + a = np.arange(3, dtype=dtype)+1 + b = np.arange(4, dtype=dtype)+1 assert_equal(np.einsum("i,j", a, b), np.outer(a, b)) # matvec(a,b) / a.dot(b) where a is matrix, b is vector - a = np.arange(20).reshape(4,5) - b = np.arange(5) + a = np.arange(20, 
dtype=dtype).reshape(4,5) + b = np.arange(5, dtype=dtype) assert_equal(np.einsum("ij,j", a, b), np.dot(a, b)) - a = np.arange(20).reshape(4,5) - b = np.arange(5) + a = np.arange(20, dtype=dtype).reshape(4,5) + b = np.arange(5, dtype=dtype) assert_equal(np.einsum("ji,j", a.T, b.T), np.dot(b.T, a.T)) # matmat(a,b) / a.dot(b) where a is matrix, b is matrix - a = np.arange(20).reshape(4,5) - b = np.arange(30).reshape(5,6) + a = np.arange(20, dtype=dtype).reshape(4,5) + b = np.arange(30, dtype=dtype).reshape(5,6) assert_equal(np.einsum("ij,jk", a, b), np.dot(a, b)) # tensordot(a, b) - a = np.arange(60.).reshape(3,4,5) - b = np.arange(24.).reshape(4,3,2) - assert_equal(np.einsum("ijk,jil->kl", a, b), - np.tensordot(a,b, axes=([1,0],[0,1]))) + if np.dtype(dtype) != np.dtype('f2'): + a = np.arange(60, dtype=dtype).reshape(3,4,5) + b = np.arange(24, dtype=dtype).reshape(4,3,2) + assert_equal(np.einsum("ijk,jil->kl", a, b), + np.tensordot(a,b, axes=([1,0],[0,1]))) # logical_and(logical_and(a!=0, b!=0), c!=0) - a = np.array([1, 3, -2, 0, 12, 13, 0, 1]) - b = np.array([0, 3.5, 0., -2, 0, 1, 3, 12]) + a = np.array([1, 3, -2, 0, 12, 13, 0, 1], dtype=dtype) + b = np.array([0, 3.5, 0., -2, 0, 1, 3, 12], dtype=dtype) c = np.array([True,True,False,True,True,False,True,True]) assert_equal(np.einsum("i,i,i->i", a, b, c, dtype='?', casting='unsafe'), logical_and(logical_and(a!=0, b!=0), c!=0)) + a = np.arange(9, dtype=dtype) + assert_equal(np.einsum(",i->", 3, a), 3*np.sum(a)) + assert_equal(np.einsum("i,->", a, 3), 3*np.sum(a)) + + # Various stride0, contiguous, and SSE aligned variants + a = np.arange(64, dtype=dtype) + if np.dtype(dtype).itemsize > 1: + assert_equal(np.einsum(",",a,a), np.multiply(a,a)) + assert_equal(np.einsum("i,i", a, a), np.dot(a,a)) + assert_equal(np.einsum("i,->i", a, 2), 2*a) + assert_equal(np.einsum(",i->i", 2, a), 2*a) + assert_equal(np.einsum("i,->", a, 2), 2*np.sum(a)) + assert_equal(np.einsum(",i->", 2, a), 2*np.sum(a)) + + assert_equal(np.einsum(",",a[1:],a[:-1]), np.multiply(a[1:],a[:-1])) + assert_equal(np.einsum("i,i", a[1:], a[:-1]), np.dot(a[1:],a[:-1])) + assert_equal(np.einsum("i,->i", a[1:], 2), 2*a[1:]) + assert_equal(np.einsum(",i->i", 2, a[1:]), 2*a[1:]) + assert_equal(np.einsum("i,->", a[1:], 2), 2*np.sum(a[1:])) + assert_equal(np.einsum(",i->", 2, a[1:]), 2*np.sum(a[1:])) + + + def test_einsum_sums_int8(self): + self.check_einsum_sums('i1'); + + def test_einsum_sums_uint8(self): + self.check_einsum_sums('u1'); + + def test_einsum_sums_int16(self): + self.check_einsum_sums('i2'); + + def test_einsum_sums_uint16(self): + self.check_einsum_sums('u2'); + + def test_einsum_sums_int32(self): + self.check_einsum_sums('i4'); + + def test_einsum_sums_uint32(self): + self.check_einsum_sums('u4'); + + def test_einsum_sums_int64(self): + self.check_einsum_sums('i8'); + + def test_einsum_sums_uint64(self): + self.check_einsum_sums('u8'); + + def test_einsum_sums_float16(self): + self.check_einsum_sums('f2'); + + def test_einsum_sums_float32(self): + self.check_einsum_sums('f4'); + + def test_einsum_sums_float64(self): + self.check_einsum_sums('f8'); + + def test_einsum_sums_longdouble(self): + self.check_einsum_sums(np.longdouble); + + def test_einsum_sums_cfloat64(self): + self.check_einsum_sums('c8'); + + def test_einsum_sums_cfloat128(self): + self.check_einsum_sums('c16'); + + def test_einsum_sums_clongdouble(self): + self.check_einsum_sums(np.clongdouble); class TestNonarrayArgs(TestCase): # check that non-array arguments to functions wrap them in arrays |
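
Several of the SSE reduction loops above carry the note that accumulating four partial sums changes the summation order, so float32 reductions can differ from a plain serial sum in the last bits. A minimal sketch of how a caller might account for that when checking results (not part of the patch; the tolerance is an arbitrary illustrative choice):

```python
import numpy as np
from numpy.testing import assert_allclose

a = np.random.rand(64).astype('f4')
b = np.random.rand(64).astype('f4')

# Reference computed in float64 with a straightforward accumulation.
reference = np.dot(a.astype('f8'), b.astype('f8'))

# float32 einsum may take the SSE loop, which keeps four partial
# accumulators, so compare with a tolerance rather than exact equality.
assert_allclose(np.einsum('i,i', a, b), reference, rtol=1e-5)
```
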