author     Sayed Adel <seiko@imavr.com>    2021-12-31 04:16:00 +0200
committer  Sayed Adel <seiko@imavr.com>    2021-12-31 04:59:56 +0200
commit     1b55815e90009beaa064856b57575c267f297699 (patch)
tree       490e3377c4f7214c5650b1f257fa34cf252d381c /numpy
parent     91a4b980b2381c77faede2f936da4bc9831802cd (diff)
download   numpy-1b55815e90009beaa064856b57575c267f297699.tar.gz
ENH: remove raw x86 SIMD of max/min
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/umath/loops.c.src   |  33
-rw-r--r--  numpy/core/src/umath/simd.inc.src  | 188
2 files changed, 3 insertions(+), 218 deletions(-)
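The deleted loops all implement NumPy's NaN-propagating semantics for np.maximum/np.minimum. The recurring scalar pattern below, `in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;`, returns NaN whenever either operand is NaN: a NaN `in2` makes the comparison fail so `in2` is returned, and a NaN `in1` is caught by the `npy_isnan` test. Writing it as one conditional expression is what the "Order of operations important for MSVC 2015" comments refer to. Here is a minimal standalone sketch of that pattern, using plain `isnan()` from `<math.h>` in place of `npy_isnan()`; the helper name `maximum_f` is hypothetical, not NumPy code:

```c
#include <math.h>
#include <stdio.h>

/* NaN-propagating maximum, the scalar pattern used throughout the diff.
 * Note this differs from fmaxf(), which returns the non-NaN operand. */
static float maximum_f(float in1, float in2)
{
    /* comparison first, then the NaN test on in1, as in the diff */
    return (in1 >= in2 || isnan(in1)) ? in1 : in2;
}

int main(void)
{
    printf("%g\n", maximum_f(1.0f, 2.0f)); /* 2 */
    printf("%g\n", maximum_f(NAN, 2.0f));  /* nan: isnan(in1) keeps in1 */
    printf("%g\n", maximum_f(1.0f, NAN));  /* nan: comparison fails, in2 returned */
    return 0;
}
```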
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 6076e0b2d..5f054d0a9 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1658,39 +1658,6 @@ NPY_NO_EXPORT void
     }
 }
 
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP = >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_REDUCE_LOOP(@type@) {
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
-            }
-            *((@type@ *)iop1) = io1;
-        }
-    }
-    else {
-        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_LOOP {
-                @type@ in1 = *(@type@ *)ip1;
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
-                *((@type@ *)op1) = in1;
-            }
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0e2c1ab8b..8b833ee56 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -95,38 +95,6 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
  */
 
 /**begin repeat1
- * #func = maximum, minimum#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512F_@func@_@TYPE@(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-/**end repeat1**/
-
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
  * #func = isnan, isfinite, isinf, signbit#
  */
 
@@ -204,9 +172,9 @@ run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
  */
 
 /**begin repeat1
- * #func = negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary, unary_reduce*2#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
  */
 
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -678,55 +646,6 @@ sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
 }
 /**end repeat1**/
 
-
-/**begin repeat1
- * #kind = maximum, minimum#
- * #VOP = max, min#
- * #OP = >=, <=#
- **/
-/* arguments swapped as unary reduce has the swapped compared to unary */
-static void
-sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
-{
-    const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
-    LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
-        /* Order of operations important for MSVC 2015 */
-        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
-    }
-    assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
-    if (i + 3 * stride <= n) {
-        /* load the first elements */
-        @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
-        @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
-        i += 2 * stride;
-
-        /* minps/minpd will set invalid flag if nan is encountered */
-        npy_clear_floatstatus_barrier((char*)&c1);
-        LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
-            @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
-            @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
-            c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
-            c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
-        }
-        c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
-
-        if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
-            *op = @nan@;
-        }
-        else {
-            @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
-            /* Order of operations important for MSVC 2015 */
-            *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
-        }
-    }
-    LOOP_BLOCKED_END {
-        /* Order of operations important for MSVC 2015 */
-        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
-    }
-    npy_clear_floatstatus_barrier((char*)op);
-}
-/**end repeat1**/
-
 /**end repeat**/
 
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
@@ -1200,107 +1119,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
 /**end repeat**/
 
 /**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype1 = __m512, __m512d#
- * #vtype2 = __m512i, __m256i#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexsize = 512, 256#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
- * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
- * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
- * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
- * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- * #vectorf = max, min#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
-    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
-    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
-    const npy_intp array_size = dimensions[0];
-    npy_intp num_remaining_elements = array_size;
-    @type@* ip1 = (@type@*) args[0];
-    @type@* ip2 = (@type@*) args[1];
-    @type@* op = (@type@*) args[2];
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip1[ii] = ii*stride_ip1;
-        index_ip2[ii] = ii*stride_ip2;
-        index_op[ii] = ii*stride_op;
-    }
-    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
-    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
-    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype1@ x1, x2;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
-        }
-        if (stride_ip2 == 1) {
-            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-        }
-        else {
-            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
-        }
-
-        /*
-         * when only one of the argument is a nan, the maxps/maxpd instruction
-         * returns the second argument. The additional blend instruction fixes
-         * this issue to conform with NumPy behaviour.
-         */
-        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
-        @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
-        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        }
-        else {
-            /* scatter! */
-            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
-        }
-
-        ip1 += @num_lanes@*stride_ip1;
-        ip2 += @num_lanes@*stride_ip2;
-        op += @num_lanes@*stride_op;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512#
  * #vsize = 256, 512#
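Two tricks in the removed code are worth spelling out. First, the SSE2 reduction relies on the hardware behaviour its comment mentions: minps/maxps set the invalid flag in MXCSR even for quiet-NaN inputs, so the hot loop needs no per-element NaN check and the flag is tested once at the end. A minimal sketch of that idea, assuming a contiguous float32 array whose length is a nonzero multiple of 4; the name `sse2_max_f32` is hypothetical, and `<fenv.h>` stands in for NumPy's `npy_*_floatstatus_barrier` helpers (which also act as compiler barriers):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <fenv.h>
#include <math.h>
#include <stddef.h>

/* Reduce ip[0..n) to its maximum with NumPy semantics:
 * any NaN anywhere in the input makes the result NaN. */
float sse2_max_f32(const float *ip, size_t n)
{
    feclearexcept(FE_INVALID);
    __m128 acc = _mm_loadu_ps(ip);
    for (size_t i = 4; i < n; i += 4) {
        /* maxps raises the invalid exception when a source lane is NaN
         * (even a quiet NaN), so the NaN check is deferred to the end */
        acc = _mm_max_ps(acc, _mm_loadu_ps(ip + i));
    }
    if (fetestexcept(FE_INVALID)) {
        return NAN;  /* a NaN was encountered somewhere: propagate it */
    }
    /* horizontal max of the four lanes */
    acc = _mm_max_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(1, 0, 3, 2)));
    acc = _mm_max_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtss_f32(acc);
}
```

Second, the AVX512F binary loop has to correct for maxps/maxpd returning the second operand when only one input is NaN: a self-compare mask finds the lanes where the first operand is NaN and blends that operand back over the max result. A sketch of just that fix, for contiguous float32 data with a length that is a multiple of 16; `max_f32_avx512` is a hypothetical name, not the NumPy entry point:

```c
#include <immintrin.h>  /* AVX-512F intrinsics */
#include <stddef.h>

void max_f32_avx512(float *op, const float *ip1, const float *ip2, size_t n)
{
    for (size_t i = 0; i < n; i += 16) {
        __m512 x1 = _mm512_loadu_ps(ip1 + i);
        __m512 x2 = _mm512_loadu_ps(ip2 + i);
        /* x1 != x1 (unordered compare) is true exactly in NaN lanes */
        __mmask16 nan_mask = _mm512_cmp_ps_mask(x1, x1, _CMP_NEQ_UQ);
        /* maxps already propagates NaNs from x2, since it returns the
         * second operand whenever one input is NaN ... */
        __m512 out = _mm512_max_ps(x1, x2);
        /* ... and the blend re-inserts x1 in the lanes where x1 was NaN */
        out = _mm512_mask_blend_ps(nan_mask, out, x1);
        _mm512_storeu_ps(op + i, out);
    }
}
```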