author:    Matti Picus <matti.picus@gmail.com>    2020-02-05 14:04:33 +0200
committer: GitHub <noreply@github.com>            2020-02-05 14:04:33 +0200
commit:    c4276e2183684c0763f61342022231481cffb329
tree:      61cbbc3291a6c0753815284e2912ffe1df9ab916 /numpy/core
parent:    32ce6b8cea5b464ea9b008e2b0cc3d86615a1bd5
parent:    d5b4b721cce90adea3592c126087f1fbe489784e
download:  numpy-c4276e2183684c0763f61342022231481cffb329.tar.gz
Merge pull request #15408 from r-devulap/cmplx-simd
ENH: Use AVX-512F for complex number arithmetic, absolute, square and conjugate
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/code_generators/generate_umath.py  |  13
-rw-r--r--  numpy/core/src/umath/loops.c.src              |  43
-rw-r--r--  numpy/core/src/umath/loops.h.src              |  29
-rw-r--r--  numpy/core/src/umath/simd.inc.src             | 348
-rw-r--r--  numpy/core/tests/test_umath_complex.py        |  39
5 files changed, 456 insertions, 16 deletions
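
For orientation, a minimal NumPy-level sketch of what this merge accelerates: the ufuncs below gain AVX-512F inner loops for complex64/complex128 operands, selected automatically at runtime when the CPU supports the instruction set; the Python-level API and results are unchanged.

    import numpy as np

    a = (np.arange(8) + 1j * np.arange(8)).astype(np.complex64)
    b = np.full(8, 2.0 + 0.5j, dtype=np.complex64)

    np.add(a, b)        # CFLOAT_add_avx512f when available, scalar loop otherwise
    np.subtract(a, b)   # CFLOAT_subtract_avx512f
    np.multiply(a, b)   # CFLOAT_multiply_avx512f
    np.conjugate(a)     # CFLOAT_conjugate_avx512f
    np.square(a)        # CFLOAT_square_avx512f
    np.abs(a)           # CFLOAT_absolute_avx512f (real-valued float32 output)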
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f22380ccb..c14711d16 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -233,6 +233,7 @@ flts = 'efdg'
fltsO = flts + O
fltsP = flts + P
cmplx = 'FDG'
+cmplxvec = 'FD'
cmplxO = cmplx + O
cmplxP = cmplx + P
inexact = flts + cmplx
@@ -268,7 +269,7 @@ defdict = {
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)]),
+ TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -279,7 +280,7 @@ defdict = {
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
- TD(ints + inexact, simd=[('avx2', ints)]),
+ TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -290,7 +291,7 @@ defdict = {
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)]),
+ TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -325,7 +326,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.conjugate'),
None,
- TD(ints+flts+cmplx, simd=[('avx2', ints)]),
+ TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
TD(P, f='conjugate'),
),
'fmod':
@@ -340,7 +341,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
TD(O, f='Py_square'),
),
'reciprocal':
@@ -378,7 +379,7 @@ defdict = {
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
- TD(cmplx, out=('f', 'd', 'g')),
+ TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
'_arg':
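
The generator changes above only register extra 'avx512f' variants for the existing complex type codes ('F' = complex64, 'D' = complex128); the set of supported ufunc signatures does not change. As a quick, hedged check, the snippet below lists the complex signatures these ufuncs expose (it inspects type signatures only, not which SIMD variant is dispatched):

    import numpy as np

    for uf in (np.add, np.subtract, np.multiply, np.conjugate, np.square, np.absolute):
        complex_sigs = [sig for sig in uf.types if set(sig) & {'F', 'D', 'G'}]
        print(uf.__name__, complex_sigs)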
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index b310d73ff..9b43824cb 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -2509,6 +2509,7 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps,
* #ftype = npy_float, npy_double, npy_longdouble#
* #c = f, , l#
* #C = F, , L#
+ * #SIMD = 1, 1, 0#
*/
/* similar to pairwise sum of real floats */
@@ -2584,6 +2585,7 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
}
}
+
/**begin repeat1
* arithmetic
* #kind = add, subtract#
@@ -2662,6 +2664,32 @@ NPY_NO_EXPORT void
}
}
+#if @SIMD@
+NPY_NO_EXPORT void
+@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+ if (IS_BINARY_REDUCE) {
+ @TYPE@_add(args, dimensions, steps, func);
+ }
+ else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) {
+ @TYPE@_add(args, dimensions, steps, func);
+ }
+}
+
+/**begin repeat1
+ * arithmetic
+ * #kind = subtract, multiply#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+ if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+ @TYPE@_@kind@(args, dimensions, steps, func);
+ }
+}
+/**end repeat1**/
+#endif
+
NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -2819,6 +2847,21 @@ NPY_NO_EXPORT void
}
}
+#if @SIMD@
+/**begin repeat1
+ * arithmetic
+ * #kind = conjugate, square, absolute#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+ if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+ @TYPE@_@kind@(args, dimensions, steps, func);
+ }
+}
+/**end repeat1**/
+#endif
+
NPY_NO_EXPORT void
@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6c89627ca..e9d0b4c62 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -356,20 +356,27 @@ NPY_NO_EXPORT void
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
+ * #IFSIMD = 1, 1, 0#
*/
/**begin repeat1
+ * #isa = , _avx512f#
+ */
+
+/**begin repeat2
* arithmetic
* #kind = add, subtract#
* #OP = +, -#
*/
+
NPY_NO_EXPORT void
-C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat1**/
+/**end repeat2**/
NPY_NO_EXPORT void
-C@TYPE@_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
NPY_NO_EXPORT void
C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -409,19 +416,24 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
/**end repeat1**/
NPY_NO_EXPORT void
-C@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
NPY_NO_EXPORT void
C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+/**begin repeat1
+ * #isa = , _avx512f#
+ */
+
NPY_NO_EXPORT void
-C@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat1**/
NPY_NO_EXPORT void
C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -444,7 +456,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
NPY_NO_EXPORT void
C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat1**/
-
#define C@TYPE@_true_divide C@TYPE@_divide
/**end repeat**/
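
The header hunk relies on the repeat-template preprocessor used for .src files. As a rough illustration of what the nested /**begin repeat**/ blocks above expand to, the sketch below prints the prototype names generated for the CFLOAT and CDOUBLE rows (the template also expands the LONGDOUBLE row; it is simply omitted here, and the two declaration groups are merged for brevity).

    proto = ("NPY_NO_EXPORT void C{T}_{kind}{isa}"
             "(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func);")
    for T in ("FLOAT", "DOUBLE"):
        for isa in ("", "_avx512f"):
            for kind in ("add", "subtract", "multiply",
                         "conjugate", "absolute", "square"):
                print(proto.format(T=T, kind=kind, isa=isa))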
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index cd485034e..7ec90f9c8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,13 @@ abs_ptrdiff(char *a, char *b)
return (a > b) ? (a - b) : (b - a);
}
+#define IS_BINARY_STRIDE_ONE(esize, vsize) \
+ ((steps[0] == esize) && \
+ (steps[1] == esize) && \
+ (steps[2] == esize) && \
+ (abs_ptrdiff(args[2], args[0]) >= vsize) && \
+ (abs_ptrdiff(args[2], args[1]) >= vsize))
+
/*
* stride is equal to element size and input and destination are equal or
* don't overlap within one register. The check of the steps against
@@ -158,6 +165,71 @@ abs_ptrdiff(char *a, char *b)
/*
*****************************************************************************
+ ** CMPLX DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type= npy_float, npy_double#
+ * #esize = 8, 16#
+ */
+
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
+ AVX512F_@func@_@TYPE@(args, dimensions, steps);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ * #func = square, absolute, conjugate#
+ * #outsize = 1, 2, 1#
+ * #max_stride = 2, 8, 8#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+ if ((IS_OUTPUT_BLOCKABLE_UNARY((npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
+ AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
** FLOAT DISPATCHERS
*****************************************************************************
*/
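
A Python sketch (illustration only) of the gating that run_binary_avx512f_* performs via IS_BINARY_STRIDE_ONE above: every operand must have a byte stride equal to the element size, and the output pointer must sit at least one 512-bit register (64 bytes) away from each input so that stores cannot clobber elements that are yet to be loaded. Addresses are modelled as plain integers here.

    def is_binary_stride_one(steps, args, esize, vsize=64):
        return (steps[0] == esize and steps[1] == esize and steps[2] == esize
                and abs(args[2] - args[0]) >= vsize
                and abs(args[2] - args[1]) >= vsize)

    # complex64 (esize = 8 bytes): contiguous, well-separated buffers qualify
    print(is_binary_stride_one([8, 8, 8], [0, 1024, 2048], esize=8))   # True
    # output starts only 8 bytes past the second input -> fall back to scalar
    print(is_binary_stride_one([8, 8, 8], [0, 1024, 1032], esize=8))   # False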
@@ -1591,9 +1663,17 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
}
/**begin repeat
* #vsub = ps, pd#
+ * #type= npy_float, npy_double#
* #epi_vsub = epi32, epi64#
* #vtype = __m512, __m512d#
+ * #mask = __mmask16, __mmask8#
* #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ * #neg_mask = 0x80000000, 0x8000000000000000#
+ * #perm_ = 0xb1, 0x55#
+ * #cmpx_img_mask = 0xAAAA, 0xAA#
+ * #cmpx_re_mask = 0x5555, 0x55#
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
*/
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_abs_@vsub@(@vtype@ x)
@@ -1631,6 +1711,96 @@ avx512_trunc_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x0B);
}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hadd_@vsub@(const @vtype@ x)
+{
+ return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hsub_@vsub@(const @vtype@ x)
+{
+ return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cabsolute_@vsub@(const @vtype@ x1,
+ const @vtype@ x2,
+ const __m512i re_indices,
+ const __m512i im_indices)
+{
+ @vtype@ inf = _mm512_set1_@vsub@(@INF@);
+ @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
+ @vtype@ x1_abs = avx512_abs_@vsub@(x1);
+ @vtype@ x2_abs = avx512_abs_@vsub@(x2);
+ @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
+ @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
+ @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
+ im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
+ re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
+
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
+ @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
+ im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
+ re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
+
+ @vtype@ larger = _mm512_max_@vsub@(re, im);
+ @vtype@ smaller = _mm512_min_@vsub@(im, re);
+
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
+ @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
+ @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
+ @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
+ @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
+ ratio, ratio, _mm512_set1_@vsub@(1.0f)));
+ return _mm512_mul_@vsub@(hypot, larger);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_conjugate_@vsub@(const @vtype@ x)
+{
+ /*
+ * _mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
+ * use the xor_epi32/64 instruction instead. The cast is a zero-latency instruction.
+ */
+ __m512i cast_x = _mm512_cast@vsub@_si512(x);
+ __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
+ cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
+ return _mm512_castsi512_@vsub@(res);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
+{
+ // x1 = r1, i1
+ // x2 = r2, i2
+ @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
+ @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
+ @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
+ @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
+ @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
+ return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_csquare_@vsub@(@vtype@ x)
+{
+ return avx512_cmul_@vsub@(x, x);
+}
+
/**end repeat**/
#endif
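
A worked check in plain NumPy (not the intrinsics) of the identities the helpers above vectorize: avx512_cmul computes (r1*r2 - i1*i2) + j*(r1*i2 + i1*r2) from a permute plus horizontal sub/add, avx512_conjugate flips the sign bit of every imaginary lane with an integer XOR against the sign mask, and avx512_cabsolute evaluates hypot by scaling with the larger magnitude so the squares cannot overflow.

    import math
    import numpy as np

    z1 = np.array([1.0 + 2.0j, -3.0 + 0.5j], dtype=np.complex64)
    z2 = np.array([0.25 - 4.0j, 2.0 + 2.0j], dtype=np.complex64)

    # complex multiply identity used by avx512_cmul
    manual = (z1.real * z2.real - z1.imag * z2.imag) \
             + 1j * (z1.real * z2.imag + z1.imag * z2.real)
    assert np.allclose(z1 * z2, manual)

    # conjugate as a sign-bit flip on the imaginary 32-bit lanes (complex64 layout)
    bits = z1.view(np.uint32).copy()
    bits[1::2] ^= np.uint32(0x80000000)
    assert np.array_equal(bits.view(np.complex64), np.conjugate(z1))

    # scaled hypot, mirroring avx512_cabsolute (inf/nan lanes are patched with
    # masks in the vector code before this step)
    def scaled_hypot(re, im):
        re, im = abs(re), abs(im)
        larger, smaller = max(re, im), min(re, im)
        if larger == 0.0 or math.isinf(smaller):
            return larger                      # division is masked out
        ratio = smaller / larger
        return larger * math.sqrt(1.0 + ratio * ratio)

    assert scaled_hypot(3.0, 4.0) == 5.0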
@@ -2450,6 +2620,184 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
#endif
/**end repeat**/
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type = npy_float, npy_double#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #epi_vsub = epi32, epi64#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #storemask = 0xFF, 0xF#
+ * #IS_FLOAT = 1, 0#
+ */
+
+/**begin repeat1
+ * #func = add, subtract, multiply#
+ * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+ const npy_intp array_size = dimensions[0];
+ npy_intp num_remaining_elements = 2*array_size;
+ @type@* ip1 = (@type@*) args[0];
+ @type@* ip2 = (@type@*) args[1];
+ @type@* op = (@type@*) args[2];
+
+ @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < @num_lanes@) {
+ load_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements, @num_lanes@);
+ }
+ @vtype@ x1, x2;
+ x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+ x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+
+ @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
+
+ _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+
+ ip1 += @num_lanes@;
+ ip2 += @num_lanes@;
+ op += @num_lanes@;
+ num_remaining_elements -= @num_lanes@;
+ }
+}
+#endif
+/**end repeat1**/
+
+/**begin repeat1
+ * #func = square, conjugate#
+ * #vectorf = avx512_csquare, avx512_conjugate#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@ * op,
+ @type@ * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = 2*array_size;
+ const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
+ /*
+ * Note: while generally indices are npy_intp, we ensure that our maximum index
+ * will fit in an int32 as a precondition for this function via max_stride
+ */
+ npy_int32 index_ip1[16];
+ for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
+ index_ip1[ii] = ii*stride_ip1;
+ index_ip1[ii+1] = ii*stride_ip1 + 1;
+ }
+ @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
+ @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+ @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < @num_lanes@) {
+ load_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements, @num_lanes@);
+ }
+ @vtype@ x1;
+ if (stride_ip1 == 1) {
+ x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
+ }
+ else {
+ x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
+ }
+
+ @vtype@ out = @vectorf@_@vsuffix@(x1);
+
+ _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+ op += @num_lanes@;
+ ip += @num_lanes@*stride_ip1;
+ num_remaining_elements -= @num_lanes@;
+ }
+}
+#endif
+/**end repeat1**/
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_absolute_@TYPE@(@type@ * op,
+ @type@ * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = 2*array_size;
+ const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
+ /*
+ * Note: while generally indices are npy_intp, we ensure that our maximum index
+ * will fit in an int32 as a precondition for this function via max_stride
+ */
+ npy_int32 index_ip[32];
+ for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
+ index_ip[ii] = ii*stride_ip1;
+ index_ip[ii+1] = ii*stride_ip1 + 1;
+ }
+ @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
+ @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
+
+ @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+ @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
+ @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
+ @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+#if @IS_FLOAT@
+ __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
+ __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
+#else
+ __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
+ __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
+#endif
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < @num_lanes@) {
+ load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements, @num_lanes@);
+ load_mask2 = 0x0000;
+ store_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements/2, @num_lanes@);
+ } else if (num_remaining_elements < 2*@num_lanes@) {
+ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+ load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements - @num_lanes@, @num_lanes@);
+ store_mask = avx512_get_partial_load_mask_@vsuffix@(
+ num_remaining_elements/2, @num_lanes@);
+ }
+ @vtype@ x1, x2;
+ if (stride_ip1 == 1) {
+ x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
+ x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
+ }
+ else {
+ x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
+ x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
+ }
+
+ @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
+
+ _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
+ op += @num_lanes@;
+ ip += 2*@num_lanes@*stride_ip1;
+ num_remaining_elements -= 2*@num_lanes@;
+ }
+ npy_clear_floatstatus_barrier((char*)op);
+}
+
+#endif
+/**end repeat**/
+
/*
*****************************************************************************
** BOOL LOOPS
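
The inner loops above avoid a scalar tail by switching to a partial load/store mask for the final, shorter block; strided inputs are handled with a gather through a precomputed index vector. A NumPy sketch (illustration only) of the remainder handling:

    import numpy as np

    def masked_block_add(a, b, num_lanes=16):
        # process num_lanes scalars per iteration; the last block simply uses a
        # shorter slice where the C code uses avx512_get_partial_load_mask_*
        out = np.empty_like(a)
        n = len(a)
        for start in range(0, n, num_lanes):
            lanes = min(num_lanes, n - start)
            sl = slice(start, start + lanes)
            out[sl] = a[sl] + b[sl]            # masked load / add / masked store
        return out

    x = np.arange(19, dtype=np.float32)        # 19 is deliberately not a multiple of 16
    assert np.array_equal(masked_block_add(x, x), x + x)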
diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py
index 5e5ced85c..a21158420 100644
--- a/numpy/core/tests/test_umath_complex.py
+++ b/numpy/core/tests/test_umath_complex.py
@@ -6,7 +6,7 @@ import numpy as np
# import the c-extension module directly since _arg is not exported via umath
import numpy.core._multiarray_umath as ncu
from numpy.testing import (
- assert_raises, assert_equal, assert_array_equal, assert_almost_equal
+ assert_raises, assert_equal, assert_array_equal, assert_almost_equal, assert_array_max_ulp
)
# TODO: branch cuts (use Pauli code)
@@ -540,3 +540,40 @@ def check_complex_value(f, x1, y1, x2, y2, exact=True):
assert_equal(f(z1), z2)
else:
assert_almost_equal(f(z1), z2)
+
+class TestSpecialComplexAVX(object):
+ @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+ @pytest.mark.parametrize("astype", [np.complex64, np.complex128])
+ def test_array(self, stride, astype):
+ arr = np.array([np.complex(np.nan , np.nan),
+ np.complex(np.nan , np.inf),
+ np.complex(np.inf , np.nan),
+ np.complex(np.inf , np.inf),
+ np.complex(0. , np.inf),
+ np.complex(np.inf , 0.),
+ np.complex(0. , 0.),
+ np.complex(0. , np.nan),
+ np.complex(np.nan , 0.)], dtype=astype)
+ abs_true = np.array([np.nan, np.inf, np.inf, np.inf, np.inf, np.inf, 0., np.nan, np.nan], dtype=arr.real.dtype)
+ sq_true = np.array([np.complex(np.nan, np.nan),
+ np.complex(np.nan, np.nan),
+ np.complex(np.nan, np.nan),
+ np.complex(np.nan, np.inf),
+ np.complex(-np.inf, np.nan),
+ np.complex(np.inf, np.nan),
+ np.complex(0., 0.),
+ np.complex(np.nan, np.nan),
+ np.complex(np.nan, np.nan)], dtype=astype)
+ assert_equal(np.abs(arr[::stride]), abs_true[::stride])
+ with np.errstate(invalid='ignore'):
+ assert_equal(np.square(arr[::stride]), sq_true[::stride])
+
+class TestComplexAbsoluteAVX(object):
+ @pytest.mark.parametrize("arraysize", [1,2,3,4,5,6,7,8,9,10,11,13,15,17,18,19])
+ @pytest.mark.parametrize("stride", [-4,-3,-2,-1,1,2,3,4])
+ @pytest.mark.parametrize("astype", [np.complex64, np.complex128])
+ # test to ensure masking and strides work as intended in the AVX implementation
+ def test_array(self, arraysize, stride, astype):
+ arr = np.ones(arraysize, dtype=astype)
+ abs_true = np.ones(arraysize, dtype=arr.real.dtype)
+ assert_equal(np.abs(arr[::stride]), abs_true[::stride])
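
Beyond the special values and all-ones arrays above, a quick hand check (not part of the diff) that exercises the new kernels over the same strides the tests parametrize, comparing against straightforward references; the ulp bound is a loose assumption, not a documented guarantee.

    import numpy as np
    from numpy.testing import assert_array_equal, assert_array_max_ulp

    rng = np.random.default_rng(0)
    arr = (rng.standard_normal(19) + 1j * rng.standard_normal(19)).astype(np.complex64)

    for stride in (-4, -3, -2, -1, 1, 2, 3, 4):
        s = arr[::stride]
        assert_array_max_ulp(np.abs(s), np.hypot(s.real, s.imag), maxulp=3)
        assert_array_equal(np.conjugate(s), s.real - 1j * s.imag)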