Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py |   4
-rw-r--r--  numpy/core/src/umath/loops.c.src             |  28
-rw-r--r--  numpy/core/src/umath/loops.h.src             |   8
-rw-r--r--  numpy/core/src/umath/simd.inc.src            | 118
4 files changed, 156 insertions(+), 2 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f9ee7d993..1fd08241d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -492,14 +492,14 @@ defdict = {
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.maximum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj),
+ TD(noobj, simd=[('avx512f', 'fd')]),
TD(O, f='npy_ObjectMax')
),
'minimum':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.minimum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj),
+ TD(noobj, simd=[('avx512f', 'fd')]),
TD(O, f='npy_ObjectMin')
),
'clip':
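
The simd=[('avx512f', 'fd')] entries tell the code generator to emit AVX512F-specialized inner loops for the 'f' (float) and 'd' (double) type characters, selected at runtime when the host CPU supports AVX512F. As a rough standalone model of that dispatch (not the generator's actual output; the selector below and __builtin_cpu_supports are stand-ins for NumPy's own CPU-feature machinery):

    #include <stdio.h>

    typedef void (*loop_t)(void);

    static void DOUBLE_maximum(void)         { puts("scalar loop"); }
    static void DOUBLE_maximum_avx512f(void) { puts("AVX512F loop"); }

    /* Pick the inner loop once, based on what the running CPU supports. */
    static loop_t
    select_maximum_loop(void)
    {
    #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
        __builtin_cpu_init();
        if (__builtin_cpu_supports("avx512f")) {
            return DOUBLE_maximum_avx512f;
        }
    #endif
        return DOUBLE_maximum;
    }

    int main(void)
    {
        select_maximum_loop()();
        return 0;
    }
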
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3b180ce59..b310d73ff 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1898,6 +1898,34 @@ NPY_NO_EXPORT void
* #OP = >=, <=#
**/
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /* */
+ if (IS_BINARY_REDUCE) {
+ if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_REDUCE_LOOP(@type@) {
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
+ }
+ *((@type@ *)iop1) = io1;
+ }
+ }
+ else {
+ if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /* Order of operations important for MSVC 2015 */
+ in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
+ *((@type@ *)op1) = in1;
+ }
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/* */
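
For reference, the scalar fallback keeps in1 whenever it wins the @OP@ comparison or is NaN, so NaN propagates from either operand; the explicit operation order keeps MSVC 2015 from miscompiling the test. A minimal standalone model of the maximum case (hypothetical helper name, plain double in place of the @type@ template):

    #include <math.h>
    #include <stdio.h>

    /* Scalar model of the loop body above: keep in1 when it wins the
     * comparison OR is NaN, so NaNs propagate from either operand. */
    static double
    numpy_maximum(double in1, double in2)
    {
        return (in1 >= in2 || isnan(in1)) ? in1 : in2;
    }

    int main(void)
    {
        printf("%g\n", numpy_maximum(1.0, 2.0)); /* 2   */
        printf("%g\n", numpy_maximum(NAN, 2.0)); /* nan */
        printf("%g\n", numpy_maximum(2.0, NAN)); /* nan */
        return 0;
    }
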
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8ddf201d7..6c89627ca 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,14 @@ NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**begin repeat1
+ * #func = maximum, minimum#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
/**begin repeat1
 * #isa = avx512f, fma#
 */
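
NumPy's template preprocessor expands the new repeat1 block once per #func# value inside the enclosing @TYPE@ repeat. For @TYPE@ = FLOAT the block above generates prototypes equivalent to the following sketch (the typedef and macros are simplified stand-ins for the real definitions in NumPy's internal headers):

    typedef long npy_intp;    /* stand-in for the real numpy typedef */
    #define NPY_NO_EXPORT     /* visibility macro, empty stand-in */
    #define NPY_UNUSED(x) x

    NPY_NO_EXPORT void
    FLOAT_maximum_avx512f(char **args, npy_intp const *dimensions,
                          npy_intp const *steps, void *NPY_UNUSED(func));
    NPY_NO_EXPORT void
    FLOAT_minimum_avx512f(char **args, npy_intp const *dimensions,
                          npy_intp const *steps, void *NPY_UNUSED(func));
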
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 5473b58f1..69f003473 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,6 +130,35 @@ abs_ptrdiff(char *a, char *b)
*/
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #EXISTS = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+ AVX512F_@func@_@TYPE@(args, dimensions, steps);
+ return 1;
+#endif
+ return 0;
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
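
run_binary_avx512f_@func@_@TYPE@ returns 1 when the vectorized kernel was dispatched and 0 when the caller must take the scalar fallback; the @EXISTS@ flag compiles the stub away for long double, which has no AVX512F kernel. A self-contained sketch of the same gate-and-fallback shape (using the __AVX512F__ predefine as a simplified stand-in for NumPy's HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS configure check):

    #include <stdio.h>

    /* Returns 1 if the vectorized kernel ran, 0 to request the
     * scalar fallback, mirroring the run_binary_* stubs above. */
    static int
    run_vectorized_kernel(double *x, long n)
    {
    #if defined(__AVX512F__)
        /* the AVX512F kernel would process x[0..n) here */
        (void)x; (void)n;
        return 1;
    #else
        (void)x; (void)n;
        return 0;
    #endif
    }

    int main(void)
    {
        double buf[4] = {1.0, 2.0, 3.0, 4.0};
        if (!run_vectorized_kernel(buf, 4)) {
            puts("scalar fallback path");
        }
        return 0;
    }
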
/**begin repeat
* #ISA = FMA, AVX512F#
* #isa = fma, avx512f#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
@@ -1671,6 +1700,95 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
#endif
/**end repeat**/
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexsize = 512, 256#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ * #vectorf = max, min#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+ const npy_intp stride_ip1 = steps[0]/sizeof(@type@);
+ const npy_intp stride_ip2 = steps[1]/sizeof(@type@);
+ const npy_intp stride_op = steps[2]/sizeof(@type@);
+ const npy_intp array_size = dimensions[0];
+ npy_intp num_remaining_elements = array_size;
+ @type@* ip1 = (@type@*) args[0];
+ @type@* ip2 = (@type@*) args[1];
+ @type@* op = (@type@*) args[2];
+
+ @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+ npy_int index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+ for (npy_int ii = 0; ii < @num_lanes@; ii++) {
+ index_ip1[ii] = ii*stride_ip1;
+ index_ip2[ii] = ii*stride_ip2;
+ index_op[ii] = ii*stride_op;
+ }
+ @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+ @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+ @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+ @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < @num_lanes@) {
+ load_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements, @num_lanes@);
+ }
+ @vtype@ x1, x2;
+ if (stride_ip1 == 1) {
+ x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+ }
+ else {
+ x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+ }
+ if (stride_ip2 == 1) {
+ x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+ }
+ else {
+ x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
+ }
+
+ /*
+ * when only one of the arguments is a nan, the maxps/maxpd instruction
+ * returns the second argument. The additional blend instruction fixes
+ * this issue to conform with NumPy behaviour.
+ */
+ @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
+ @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+ out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
+
+ if (stride_op == 1) {
+ _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+ }
+ else {
+ /* scatter! */
+ _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+ }
+
+ ip1 += @num_lanes@*stride_ip1;
+ ip2 += @num_lanes@*stride_ip2;
+ op += @num_lanes@*stride_op;
+ num_remaining_elements -= @num_lanes@;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
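
The blend is needed because maxps/maxpd return the second operand whenever the first is NaN, which would silently drop NaNs coming from x1; _CMP_NEQ_UQ is true exactly on the lanes where x1 is NaN, and the masked blend writes x1 back at those lanes. A scalar model of the fix (hypothetical helper names, one double standing in for a vector lane):

    #include <math.h>
    #include <stdio.h>

    /* maxpd-style max: NaN compares false, so a NaN x1 loses and x2
     * is returned -- the behaviour the blend has to correct. */
    static double
    maxpd_style(double x1, double x2)
    {
        return x1 > x2 ? x1 : x2;
    }

    static double
    fixed_maximum(double x1, double x2)
    {
        double out = maxpd_style(x1, x2);
        if (isnan(x1)) {    /* _CMP_NEQ_UQ: true only where x1 is NaN */
            out = x1;       /* blend x1 back in, as the kernel does */
        }
        return out;
    }

    int main(void)
    {
        printf("%g vs %g\n", maxpd_style(NAN, 1.0), fixed_maximum(NAN, 1.0));
        return 0;           /* prints "1 vs nan" */
    }
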
/**begin repeat
* #ISA = FMA, AVX512F#