| author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2019-10-31 19:07:34 -0700 |
|---|---|---|
| committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-01-28 08:52:42 -0800 |
| commit | f7f8e621ce4774722968f2f0a77d305cfabf46d5 (patch) | |
| tree | 8df9e27756d8d5032e22a7c10bbba60dafbd753d /numpy | |
| parent | 71ad52e79b1b35ed13dade1c395d7ad57deb9e3a (diff) | |
| download | numpy-f7f8e621ce4774722968f2f0a77d305cfabf46d5.tar.gz | |
ENH: Use AVX-512 for np.maximum and np.minimum
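The scalar fallback added in loops.c.src preserves NumPy's NaN-propagation rule: np.maximum and np.minimum return NaN whenever either input is NaN (unlike np.fmax/np.fmin), and the vectorized path must reproduce exactly that behaviour. For reference, here is a minimal standalone C sketch of the scalar rule; the helper name `nanprop_maximum` is hypothetical, chosen only for illustration of the loop body in the patch below:

```c
#include <math.h>
#include <stdio.h>

/*
 * Hypothetical helper mirroring the scalar loop body in loops.c.src:
 * for maximum the template expands @OP@ to >=, so the result is in1
 * when in1 >= in2 or when in1 is NaN, and in2 otherwise. A NaN in
 * either operand therefore propagates to the output.
 */
static double nanprop_maximum(double in1, double in2)
{
    /* Order of operations important for MSVC 2015 (per the diff) */
    return (in1 >= in2 || isnan(in1)) ? in1 : in2;
}

int main(void)
{
    printf("%g\n", nanprop_maximum(1.0, 2.0)); /* 2 */
    printf("%g\n", nanprop_maximum(NAN, 2.0)); /* nan: in1 is NaN */
    printf("%g\n", nanprop_maximum(2.0, NAN)); /* nan: 2.0 >= NaN is false, so in2 (NaN) is returned */
    return 0;
}
```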
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 4 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 28 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 8 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 118 |
4 files changed, 156 insertions(+), 2 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f9ee7d993..1fd08241d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -492,14 +492,14 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMin')
           ),
 'clip':
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3b180ce59..b310d73ff 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1898,6 +1898,34 @@ NPY_NO_EXPORT void
  * #OP =  >=, <=#
  **/
 NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*  */
+    if (IS_BINARY_REDUCE) {
+        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_REDUCE_LOOP(@type@) {
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
+            }
+            *((@type@ *)iop1) = io1;
+        }
+    }
+    else {
+        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_LOOP {
+                @type@ in1 = *(@type@ *)ip1;
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
+                *((@type@ *)op1) = in1;
+            }
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     /*  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8ddf201d7..6c89627ca 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,14 @@ NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
+ * #func = maximum, minimum#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
+/**begin repeat1
  * #isa = avx512f, fma#
  */
 
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 5473b58f1..69f003473 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,6 +130,35 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #EXISTS = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    AVX512F_@func@_@TYPE@(args, dimensions, steps);
+    return 1;
+#endif
+    return 0;
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512f#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
@@ -1671,6 +1700,95 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexsize = 512, 256#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ * #vectorf = max, min#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/sizeof(@type@);
+    const npy_intp stride_op = steps[2]/sizeof(@type@);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* ip2 = (@type@*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    npy_int index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+        }
+        else {
+            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
+        }
+
+        /*
+         * when only one of the argument is a nan, the maxps/maxpd instruction
+         * returns the second argument. The additional blend instruction fixes
+         * this issue to conform with NumPy behaviour.
+         */
+        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
+        @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
+
+        if (stride_op == 1) {
+            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        }
+        else {
+            /* scatter! */
+            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        ip2 += @num_lanes@*stride_ip2;
+        op += @num_lanes@*stride_op;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat**/
+/**end repeat1**/
 
 /**begin repeat
  * #ISA = FMA, AVX512F#
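The key subtlety in the AVX512F kernel is the NaN fix-up: vmaxps/vmaxpd return the second operand when only the first is NaN, which would silently drop NaNs and violate np.maximum semantics (if the second operand is NaN, the instruction already returns NaN, so no fix-up is needed for that case). The patch handles this with a self-compare mask (_CMP_NEQ_UQ is true only for NaN lanes) followed by a blend. Below is a self-contained sketch of that instruction sequence for doubles, outside NumPy's template machinery; it assumes an AVX-512F machine and a compiler flag such as -mavx512f, and the array values are made up for the demo:

```c
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* one __m512d worth of doubles; lane 1 of a holds a NaN */
    double a[8] = {1.0, NAN, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
    double b[8] = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0};
    double out[8];

    __m512d x1 = _mm512_loadu_pd(a);
    __m512d x2 = _mm512_loadu_pd(b);

    /* vmaxpd alone returns the *second* operand when the first is NaN */
    __m512d res = _mm512_max_pd(x1, x2);

    /* NaN is the only value that compares not-equal to itself,
     * so this mask selects exactly the NaN lanes of x1 */
    __mmask8 nan_mask = _mm512_cmp_pd_mask(x1, x1, _CMP_NEQ_UQ);

    /* blend x1 back into those lanes so the NaN propagates,
     * matching np.maximum semantics */
    res = _mm512_mask_blend_pd(nan_mask, res, x1);

    _mm512_storeu_pd(out, res);
    for (int i = 0; i < 8; i++) {
        printf("%g ", out[i]);    /* prints: 2 nan 3 4 5 6 7 8 */
    }
    printf("\n");
    return 0;
}
```

The same masking idea also covers the tail of the array in the kernel above: avx512_get_partial_load_mask_@vsuffix@ shrinks the load/store mask for the final partial iteration, so no scalar cleanup loop is needed.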