author    Sayed Adel <seiko@imavr.com>  2021-12-31 04:16:00 +0200
committer Sayed Adel <seiko@imavr.com>  2021-12-31 04:59:56 +0200
commit    1b55815e90009beaa064856b57575c267f297699 (patch)
tree      490e3377c4f7214c5650b1f257fa34cf252d381c /numpy
parent    91a4b980b2381c77faede2f936da4bc9831802cd (diff)
ENH: remove raw x86 SIMD of max/min
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/umath/loops.c.src    33
-rw-r--r--  numpy/core/src/umath/simd.inc.src  188
2 files changed, 3 insertions(+), 218 deletions(-)
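
Everything this commit deletes was a hand-written accelerator for one scalar rule, which every SIMD variant below had to reproduce exactly, including NaN propagation. A minimal sketch of that rule in plain C (illustrative only; in the real loops the `@OP@`/`@type@` substitutions and `npy_isnan` play these roles):

    #include <math.h>

    /* NumPy's maximum: keep in1 unless in2 wins the comparison AND in1
     * is not NaN, so a NaN in either input propagates to the result.
     * Folding the NaN test into a single ternary is the "Order of
     * operations important for MSVC 2015" pattern seen in this diff. */
    static double scalar_maximum(double in1, double in2)
    {
        return (in1 >= in2 || isnan(in1)) ? in1 : in2;
    }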
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 6076e0b2d..5f054d0a9 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1658,39 +1658,6 @@ NPY_NO_EXPORT void
}
}
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP = >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /* */
- if (IS_BINARY_REDUCE) {
- if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_REDUCE_LOOP(@type@) {
- const @type@ in2 = *(@type@ *)ip2;
- /* Order of operations important for MSVC 2015 */
- io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
- }
- *((@type@ *)iop1) = io1;
- }
- }
- else {
- if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- /* Order of operations important for MSVC 2015 */
- in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
- *((@type@ *)op1) = in1;
- }
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
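
For readers outside NumPy: `@TYPE@`, `@kind@`, and `@OP@` in the hunk above are expanded by the .c.src template preprocessor. A sketch of the removed function after expansion for `@TYPE@ = DOUBLE`, `@kind@ = maximum` (`@OP@` is `>=`); `IS_BINARY_REDUCE`, `BINARY_LOOP`, and the `run_binary_avx512f_*` probe are macros and helpers from these two files:

    NPY_NO_EXPORT void
    DOUBLE_maximum_avx512f(char **args, npy_intp const *dimensions,
                           npy_intp const *steps, void *NPY_UNUSED(func))
    {
        if (IS_BINARY_REDUCE) {
            /* reduce branch as in the template above, elided here */
        }
        else if (!run_binary_avx512f_maximum_DOUBLE(args, dimensions, steps)) {
            /* AVX-512 path declined (no compiler/CPU support, memory
             * overlap, or strides too large): scalar NaN-aware fallback. */
            BINARY_LOOP {
                npy_double in1 = *(npy_double *)ip1;
                const npy_double in2 = *(npy_double *)ip2;
                /* Order of operations important for MSVC 2015 */
                in1 = (in1 >= in2 || npy_isnan(in1)) ? in1 : in2;
                *((npy_double *)op1) = in1;
            }
        }
        npy_clear_floatstatus_barrier((char*)dimensions);
    }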
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0e2c1ab8b..8b833ee56 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -95,38 +95,6 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
*/
/**begin repeat1
- * #func = maximum, minimum#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-/**end repeat1**/
-
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
* #func = isnan, isfinite, isinf, signbit#
*/
@@ -204,9 +172,9 @@ run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
*/
/**begin repeat1
- * #func = negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary, unary_reduce*2#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -678,55 +646,6 @@ sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
}
/**end repeat1**/
-
-/**begin repeat1
- * #kind = maximum, minimum#
- * #VOP = max, min#
- * #OP = >=, <=#
- **/
-/* arguments swapped: the unary-reduce caller passes (ip, op), the reverse of the unary order */
-static void
-sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
-{
- const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
- LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
- /* Order of operations important for MSVC 2015 */
- *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
- }
- assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
- if (i + 3 * stride <= n) {
- /* load the first elements */
- @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
- @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
- i += 2 * stride;
-
- /* minps/minpd will set invalid flag if nan is encountered */
- npy_clear_floatstatus_barrier((char*)&c1);
- LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
- @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
- @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
- c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
- c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
- }
- c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
-
- if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
- *op = @nan@;
- }
- else {
- @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
- /* Order of operations important for MSVC 2015 */
- *op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
- }
- }
- LOOP_BLOCKED_END {
- /* Order of operations important for MSVC 2015 */
- *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
- }
- npy_clear_floatstatus_barrier((char*)op);
-}
-/**end repeat1**/
-
/**end repeat**/
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
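
The removed sse2_ reduce loop above leans on an x86 quirk: unlike most SSE operations, minps/maxps raise the invalid exception even for *quiet* NaN operands, so a single status-flag check after the blocked loop detects a NaN anywhere in the block without comparing lanes. A standalone demonstration using standard fenv in place of the npy_*floatstatus helpers (assumes an SSE-capable x86 target; not NumPy code):

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void)
    {
        float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float b[4] = {0.5f, NAN, 8.0f, 0.25f};
        float out[4];

        feclearexcept(FE_INVALID);  /* the npy_clear_floatstatus_barrier step */
        _mm_storeu_ps(out, _mm_min_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));

        /* prints "invalid=1 out[1]=nan": the quiet NaN both reached the
         * result (second operand wins) and raised the invalid flag */
        printf("invalid=%d out[1]=%f\n", fetestexcept(FE_INVALID) != 0, out[1]);
        return 0;
    }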
@@ -1200,107 +1119,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
/**end repeat**/
/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype1 = __m512, __m512d#
- * #vtype2 = __m512i, __m256i#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexsize = 512, 256#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
- * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
- * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
- * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
- * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- * #vectorf = max, min#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
- const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
- const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
- const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
- const npy_intp array_size = dimensions[0];
- npy_intp num_remaining_elements = array_size;
- @type@* ip1 = (@type@*) args[0];
- @type@* ip2 = (@type@*) args[1];
- @type@* op = (@type@*) args[2];
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via
- * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
- */
-
- npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_ip1[ii] = ii*stride_ip1;
- index_ip2[ii] = ii*stride_ip2;
- index_op[ii] = ii*stride_op;
- }
- @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
- @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
- @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
- @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype1@ x1, x2;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
- }
- if (stride_ip2 == 1) {
- x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
- }
- else {
- x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
- }
-
- /*
- * when only one of the arguments is a NaN, the maxps/maxpd instruction
- * returns the second argument. The additional blend instruction fixes
- * this issue to conform with NumPy behaviour.
- */
- @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
- @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
- out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
-
- if (stride_op == 1) {
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
- }
- else {
- /* scatter! */
- _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
- }
-
- ip1 += @num_lanes@*stride_ip1;
- ip2 += @num_lanes@*stride_ip2;
- op += @num_lanes@*stride_op;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* #ISA = FMA, AVX512F#
* #isa = fma, avx512#
* #vsize = 256, 512#
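
Finally, the NaN fix the removed AVX-512 kernel applied on every vector step, isolated: an unordered, non-signaling self-compare (`_CMP_NEQ_UQ`) is true exactly in the lanes where `x1` is NaN, and the blend re-inserts `x1` there, because max/min would otherwise return the second operand. A sketch for `@TYPE@ = FLOAT`, `@vectorf@ = max` (compile with AVX-512F enabled; the helper name is illustrative):

    #include <immintrin.h>

    /* One 16-lane step of NumPy's NaN-propagating maximum. */
    static __m512 avx512_maximum_ps(__m512 x1, __m512 x2)
    {
        /* x1 != x1 is true only in the NaN lanes of x1 */
        __mmask16 nan_mask = _mm512_cmp_ps_mask(x1, x1, _CMP_NEQ_UQ);
        /* maxps yields x2 whenever either lane is NaN ... */
        __m512 out = _mm512_max_ps(x1, x2);
        /* ... so restore x1 in its NaN lanes; a NaN in x2 alone already
         * propagates through maxps itself. */
        return _mm512_mask_blend_ps(nan_mask, out, x1);
    }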