diff options
author | Qiyu8 <fangchunlin@huawei.com> | 2020-12-14 10:16:58 +0800 |
---|---|---|
committer | Qiyu8 <fangchunlin@huawei.com> | 2020-12-14 10:16:58 +0800 |
commit | c32f60e38376e438e1d357d03c5699f4fe9c6649 (patch) | |
tree | 089d3b8e97c099674a2e8c629b9e63b84a6ba270 | |
parent | 9e26d1d2be7a961a16f8fa9ff7820c33b25415e2 (diff) | |
download | numpy-c32f60e38376e438e1d357d03c5699f4fe9c6649.tar.gz |
Optimize the performance of einsum's submodule dot
-rw-r--r-- | numpy/core/src/common/simd/simd.h | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/einsum_sumprod.c.src | 184 |
2 files changed, 49 insertions, 141 deletions
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 8804223c9..7705a48ce 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -10,7 +10,11 @@ #include "numpy/npy_common.h" #include "npy_cpu_dispatch.h" #include "simd_utils.h" - +#ifndef NPY_HAVE_AVX2 + #include <immintrin.h> + #define NPY_HAVE_AVX + #define NPY_HAVE_AVX2 +#endif #ifdef __cplusplus extern "C" { #endif diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index caba0e00a..2ef0ab13b 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -597,156 +597,60 @@ static void @type@ *data1 = (@type@ *)dataptr[1]; @temptype@ accum = 0; -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n", (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ +#if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); + const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. + /**begin repeat2 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + const npy_intp vstepx4 = vstep * 4; + for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) { + /**begin repeat3 + * #i = 0, 1, 2, 3# */ - a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); + npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); + /**end repeat3**/ + npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum); + npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3); + npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2); + vaccum = npyv_muladd_@sfx@(a0, b0, ab1); } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ - data0 += 8; - data1 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + /**end repeat2**/ + for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) { + npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count); + npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count); + vaccum = npyv_muladd_@sfx@(a, b, vaccum); } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@)); - accum_sse = _mm_add_ps(accum_sse, a); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - _mm_prefetch(data1 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); - accum_sse = _mm_add_pd(accum_sse, a); -/**end repeat2**/ + accum = npyv_sum_@sfx@(vaccum); + npyv_cleanup(); #else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]) * @from@(data1[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - data1 += 8; +#ifndef NPY_DISABLE_OPTIMIZATION + for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]); + /**end repeat2**/ + accum += ab0 + ab1 + ab2 + ab3; } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; +#endif // !NPY_DISABLE_OPTIMIZATION + for (; count > 0; --count, ++data0, ++data1) { + const @type@ a = @from@(*data0); + const @type@ b = @from@(*data1); + accum += a * b; + } +#endif // NPYV check for @type@ + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); } static void |