| author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2022-08-23 12:35:38 -0700 |
|---|---|---|
| committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2023-01-04 02:19:17 -0800 |
| commit | 3725e9f3237362037095f4100979a11864cfcc04 (patch) | |
| tree | addcfc639979feffd9dc8efd54db3b6774beb601 | |
| parent | f73d53d1a44ac841458c5be852e05021feec5dff (diff) | |
| download | numpy-3725e9f3237362037095f4100979a11864cfcc04.tar.gz | |
ENH: Implement SIMD versions of isnan, isinf, isfinite and signbit
NumPy has SIMD versions of the float / double `isnan`, `isinf`, `isfinite`, and `signbit` ufuncs for SSE2 and AVX-512. The changes here replace the SSE2 versions with implementations built on NumPy's universal intrinsics, so that other architectures gain SIMD versions of these functions as well.
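All four ufuncs share the same inner-loop contract: read float/double elements, write one boolean byte (0 or 1) per element. A minimal scalar sketch of that contract in plain C (the helper name is hypothetical, not NumPy's actual loop):

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference for the vectorized kernels: one 0/1 byte out per
 * float in. `ref_isnan_f32` is an illustrative name, not a NumPy API. */
static void ref_isnan_f32(const float *ip, uint8_t *op, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        op[i] = isnan(ip[i]) != 0;   /* ufunc contract: output is 0 or 1 */
    }
}

int main(void)
{
    const float in[4] = {1.0f, NAN, INFINITY, -0.0f};
    uint8_t out[4];
    ref_isnan_f32(in, out, 4);
    for (int i = 0; i < 4; i++) {
        printf("%d ", (int)out[i]);  /* expected: 0 1 0 0 */
    }
    printf("\n");
    return 0;
}
```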
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 8 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 24 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 12 |
| -rw-r--r-- | numpy/core/src/umath/loops_unary_fp.dispatch.c.src | 398 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 262 |
5 files changed, 427 insertions, 277 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1a05ee4..f01ac4031 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -921,7 +921,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isnan'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'isnat':
     Ufunc(1, 1, None,
@@ -933,19 +933,19 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isinf'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'isfinite':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isfinite'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'signbit':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.signbit'),
           None,
-          TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(flts, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'copysign':
     Ufunc(2, 1, None,
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 7b070a084..2b70a4b9a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1306,6 +1306,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, , l#
  * #C = F, , L#
+ * #fd = 1, 1, 0#
+ * #VCHK = 1, 1, 0#
  */
 /**begin repeat1
  * #kind = logical_and, logical_or#
@@ -1342,32 +1344,22 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@fd@
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
  **/
-
-/**begin repeat2
- * #ISA = , _avx512_skx#
- * #isa = simd, avx512_skx#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((npy_bool *)op1) = @func@(in1) != 0;
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *((npy_bool *)op1) = @func@(in1) != 0;
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
-#endif
-/**end repeat2**/
 /**end repeat1**/
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
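For LONGDOUBLE, where `@fd@` is 0, only the scalar path above is kept. Below is a rough, standalone C rendering of what that fallback does, with `UNARY_LOOP` expanded by hand and plain C types standing in for the `npy_*` typedefs (the function name is illustrative, not the generated symbol):

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hand expansion of the scalar fallback above for long double isnan. */
static void
longdouble_isnan_ref(char **args, const ptrdiff_t *dimensions, const ptrdiff_t *steps)
{
    char *ip1 = args[0], *op1 = args[1];
    for (ptrdiff_t i = 0; i < dimensions[0]; i++, ip1 += steps[0], op1 += steps[1]) {
        const long double in1 = *(long double *)ip1;
        *(uint8_t *)op1 = isnan(in1) != 0;
    }
}

int main(void)
{
    long double in[3] = {1.0L, NAN, -2.5L};
    uint8_t out[3];
    char *args[2] = {(char *)in, (char *)out};
    ptrdiff_t dims[1] = {3};
    ptrdiff_t steps[2] = {sizeof(long double), sizeof(uint8_t)};
    longdouble_isnan_ref(args, dims, steps);
    printf("%d %d %d\n", (int)out[0], (int)out[1], (int)out[2]);  /* 0 1 0 */
    return 0;
}
```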
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 411d53e94..c13a6491f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -241,7 +241,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
  * #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal,
+ *         isnan, isinf, isfinite, signbit#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -400,6 +401,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
  * #c = f, f, , l#
  * #C = F, F, , L#
  * #half = 1, 0, 0, 0#
+ * #fd = 0, 1, 1, 0#
  */
 
 /**begin repeat1
@@ -428,13 +430,13 @@ NPY_NO_EXPORT void
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
  **/
-/**begin repeat2
- * #ISA = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
 /**end repeat2**/
 /**end repeat1**/
 
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index c4e7b8929..2096893b7 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -12,10 +12,14 @@
  * such small operations that this file covers.
  */
 #define NPY_SIMD_FORCE_128
+#include <float.h>
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
 #include "loops_utils.h"
 #include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
 /**********************************************************
  ** Scalars
  **********************************************************/
@@ -80,6 +84,182 @@ NPY_FINLINE double c_square_f64(double a)
 #define c_rint_f32 npy_rintf
 #define c_rint_f64 npy_rint
 
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isnan_@sfx@(npyv_@sfx@ v)
+{
+    // (v != v) >> (size - 1)
+    npyv_@sfx@ r = npyv_cvt_@sfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isinf_@sfx@(npyv_@sfx@ v)
+{
+    // (abs(v) > fltmax) >> (size - 1)
+    const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+    npyv_@sfx@ r = vcagtq_@sfx@(v, fltmax);
+#else
+    // fabs via masking of sign bit
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+    r = npyv_cmpgt_@sfx@(r, fltmax);
+#endif
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isfinite_@sfx@(npyv_@sfx@ v)
+{
+    // ((v & signmask) <= fltmax) >> (size-1)
+    const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+    npyv_@sfx@ r = vcaleq_@sfx@(v, fltmax);
+#else
+    // fabs via masking of sign bit
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+    r = npyv_cmple_@sfx@(r, fltmax);
+#endif
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_signbit_@sfx@(npyv_@sfx@ v)
+{
+    return npyv_shri_u@ssfx@(v, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+#endif // @VCHK@
+/**end repeat**/
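The `npyv_isnan` helper above leans on NaN being the only value that compares unequal to itself: the lane-wide compare yields an all-ones mask, and a logical right shift by `size - 1` reduces it to the 0/1 the ufunc needs. The same trick for a single lane, in standalone C:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* One-lane model of npyv_isnan_f32: cmpneq(v, v) yields an all-ones
 * 32-bit mask for NaN lanes; shifting right by 31 turns that into
 * the boolean 1 the ufunc wants. */
static uint32_t isnan_lane_f32(float x)
{
    uint32_t mask = (x != x) ? 0xFFFFFFFFu : 0u;  /* npyv_cmpneq_f32(v, v) */
    return mask >> 31;                            /* npyv_shri_u32(r, 31)  */
}

int main(void)
{
    printf("%u %u %u\n",
           isnan_lane_f32(NAN),       /* 1 */
           isnan_lane_f32(INFINITY),  /* 0 */
           isnan_lane_f32(-3.5f));    /* 0 */
    return 0;
}
```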
+#if defined(NPY_HAVE_NEON)
+    #define PREPACK_ISFINITE 1
+    #define PREPACK_SIGNBIT 1
+
+    #if NPY_SIMD_F32
+
+    static NPY_INLINE npyv_u8
+    npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+    {
+        // F32 exponent is 8-bits, which means we can pack multiple into
+        // a single vector.  We shift out sign bit so that we're left
+        // with only exponent in high byte.  If not all bits are set,
+        // then we've got a finite number.
+        uint8x16x4_t tbl;
+        tbl.val[0] = npyv_shli_u32(v0, 1);
+        tbl.val[1] = npyv_shli_u32(v1, 1);
+        tbl.val[2] = npyv_shli_u32(v2, 1);
+        tbl.val[3] = npyv_shli_u32(v3, 1);
+
+        const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+        npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+        const npyv_u8 expmask = npyv_setall_u8(0xff);
+        r = npyv_cmpneq_u8(r, expmask);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    static NPY_INLINE npyv_u8
+    npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+    {
+        // We only need high byte for signbit, which means we can pack
+        // multiple inputs into a single vector.
+        uint8x16x4_t tbl;
+        tbl.val[0] = v0;
+        tbl.val[1] = v1;
+        tbl.val[2] = v2;
+        tbl.val[3] = v3;
+
+        const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+        npyv_u8 r = vqtbl4q_u8(tbl, permute);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    #endif // NPY_SIMD_F32
+
+    #if NPY_SIMD_F64
+
+    static NPY_INLINE npyv_u8
+    npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                         npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+    {
+        // F64 exponent is 11-bits, which means we can pack multiple into
+        // a single vector.  We'll need to use u16 to fit all exponent
+        // bits.  If not all bits are set, then we've got a finite number.
+        uint8x16x4_t t0123, t4567;
+        t0123.val[0] = v0;
+        t0123.val[1] = v1;
+        t0123.val[2] = v2;
+        t0123.val[3] = v3;
+        t4567.val[0] = v4;
+        t4567.val[1] = v5;
+        t4567.val[2] = v6;
+        t4567.val[3] = v7;
+
+        const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+        npyv_u16 r0 = vqtbl4q_u8(t0123, permute);
+        npyv_u16 r1 = vqtbl4q_u8(t4567, permute);
+
+        const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+        r0 = npyv_and_u16(r0, expmask);
+        r0 = npyv_cmpneq_u16(r0, expmask);
+        r0 = npyv_shri_u16(r0, 15);
+        r1 = npyv_and_u16(r1, expmask);
+        r1 = npyv_cmpneq_u16(r1, expmask);
+        r1 = npyv_shri_u16(r1, 15);
+
+        npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+        return r;
+    }
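The packing helpers above exploit the IEEE-754 layout: once the sign bit is shifted out, the full f32 exponent sits in the top byte of each lane, and only inf/NaN have that byte all-ones. A standalone check of that bit layout (assumes 32-bit IEEE-754 `float`):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Why npyv_isfinite_4x_f32 can test one byte per lane: after
 * (bits << 1) the 8 exponent bits occupy the top byte, and only
 * inf and NaN have that byte all-ones. */
static int isfinite_via_top_byte(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);              /* bit-level view   */
    uint8_t top = (uint8_t)((bits << 1) >> 24);  /* exponent byte    */
    return top != 0xFF;                          /* finite iff != FF */
}

int main(void)
{
    const float vals[] = {1.5f, -0.0f, 3.4e38f, INFINITY, NAN};
    for (size_t i = 0; i < sizeof vals / sizeof vals[0]; i++) {
        printf("%g -> %d\n", (double)vals[i], isfinite_via_top_byte(vals[i]));
    }
    return 0;   /* finite values print 1, inf/NaN print 0 */
}
```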
+    static NPY_INLINE npyv_u8
+    npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                        npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+    {
+        // We only need high byte for signbit, which means we can pack
+        // multiple inputs into a single vector.
+
+        // vuzp2 faster than vtbl for f64
+        npyv_u32 v01 = vuzp2q_u32(v0, v1);
+        npyv_u32 v23 = vuzp2q_u32(v2, v3);
+        npyv_u32 v45 = vuzp2q_u32(v4, v5);
+        npyv_u32 v67 = vuzp2q_u32(v6, v7);
+
+        npyv_u16 v0123 = vuzp2q_u16(v01, v23);
+        npyv_u16 v4567 = vuzp2q_u16(v45, v67);
+
+        npyv_u8 r = vuzp2q_u8(v0123, v4567);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    #endif // NPY_SIMD_F64
+
+#else
+    #define PREPACK_ISFINITE 0
+    #define PREPACK_SIGNBIT 0
+#endif // defined(NPY_HAVE_NEON)
+
+#endif // NPY_SIMD
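`signbit` only ever needs the top bit of each lane, which is why the `vuzp2` chain above can discard everything except the high bytes before the final shift. A one-lane scalar model (assumes 64-bit IEEE-754 `double`):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar model of npyv_signbit_f64: a logical right shift by 63 leaves
 * just the sign bit, so -0.0 also reports 1, matching npy_signbit. */
static uint64_t signbit_lane_f64(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    return bits >> 63;
}

int main(void)
{
    printf("%d %d %d %d\n",
           (int)signbit_lane_f64(2.0),    /* 0 */
           (int)signbit_lane_f64(-2.0),   /* 1 */
           (int)signbit_lane_f64(-0.0),   /* 1 */
           (int)signbit_lane_f64(0.0));   /* 0 */
    return 0;
}
```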
 
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
@@ -149,6 +329,9 @@ NPY_FINLINE double c_square_f64(double a)
  * #TYPE = FLOAT, DOUBLE#
  * #sfx = f32, f64#
  * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
  */
 #if @VCHK@
 /**begin repeat1
@@ -249,10 +432,179 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 }
 /**end repeat2**/
 /**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG,  NCONTIG, NCONTIG#
+ * #unroll = 1, 1, 1, 1#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * @unroll@ * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        // Load vectors
+        #if @STYPE@ == CONTIG
+        // contiguous input
+        npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n)
+        npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@));
+        #if PACK_FACTOR == 8
+        npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78
+        #endif
+        #else
+        // non-contiguous input
+        npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride);
+        #if PACK_FACTOR == 8
+        npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride);
+        #endif
+        #endif
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            #if @ssfx@ == 32
+            npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@);
+            #elif @ssfx@ == 64
+            npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@,
+                                                 v4_@N@, v5_@N@, v6_@N@, v7_@N@);
+            #endif
+        #else
+            npyv_u@ssfx@ r0_@N@ = npyv_@kind@_@sfx@(v0_@N@);
+            npyv_u@ssfx@ r1_@N@ = npyv_@kind@_@sfx@(v1_@N@);
+            npyv_u@ssfx@ r2_@N@ = npyv_@kind@_@sfx@(v2_@N@);
+            npyv_u@ssfx@ r3_@N@ = npyv_@kind@_@sfx@(v3_@N@);
+            #if PACK_FACTOR == 8
+            npyv_u@ssfx@ r4_@N@ = npyv_@kind@_@sfx@(v4_@N@);
+            npyv_u@ssfx@ r5_@N@ = npyv_@kind@_@sfx@(v5_@N@);
+            npyv_u@ssfx@ r6_@N@ = npyv_@kind@_@sfx@(v6_@N@);
+            npyv_u@ssfx@ r7_@N@ = npyv_@kind@_@sfx@(v7_@N@);
+            #endif // PACK_FACTOR == 8
+        #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        #if @DTYPE@ == CONTIG
+            #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            // Nothing to do, results already packed
+            #else
+            // Pack results
+            #if PACK_FACTOR == 4
+            npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@));
+            #elif PACK_FACTOR == 8
+            npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@,
+                                                            r4_@N@, r5_@N@, r6_@N@, r7_@N@));
+            #endif // PACK_FACTOR == 8
+            #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+
+            npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@);
+
+        #else // @DTYPE@ == CONTIG
+            #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            // Results are packed, so we can just loop over them
+            npy_uint8 lane_@N@[npyv_nlanes_u8];
+            npyv_store_u8(lane_@N@, r_@N@);
+            for (int ln=0; ln<npyv_nlanes_u8; ++ln){
+                op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
+            }
+            #else
+            // Results are not packed. Use template to loop.
+            /**begin repeat4
+             * #R = 0, 1, 2, 3, 4, 5, 6, 7#
+             */
+            #if @R@ < PACK_FACTOR
+            npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
+            npyv_store_u8(lane@R@_@N@, r@R@_@N@);
+            op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
+            op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
+            #if npyv_nlanes_@sfx@ == 4
+            op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)];
+            op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)];
+            #endif
+            #endif
+            /**end repeat4**/
+            #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+        #endif // @DTYPE@ == CONTIG
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        #if @STYPE@ == CONTIG
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        #else
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        #endif
+
+        npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, r);
+
+        op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+        op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+        #if npyv_nlanes_@sfx@ == 4
+        op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+        op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_@kind@(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
 #endif // @VCHK@
 /**end repeat**/
 
 #undef WORKAROUND_CLANG_RECIPROCAL_BUG
+#undef PREPACK_ISFINITE
+#undef PREPACK_SIGNBIT
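With the 128-bit width this file forces, `PACK_FACTOR` above evaluates to 4 for f32 and 8 for f64: that many boolean result vectors are packed into one 16-lane u8 store. A quick standalone check of the arithmetic:

```c
#include <stdio.h>

/* PACK_FACTOR = NPY_SIMD_WIDTH / npyv_nlanes_<sfx>, evaluated here for
 * the 16-byte width that NPY_SIMD_FORCE_128 pins this file to. */
int main(void)
{
    const int width_bytes = 16;                                /* NPY_SIMD_WIDTH */
    const int nlanes_f32  = width_bytes / (int)sizeof(float);  /* 4 lanes */
    const int nlanes_f64  = width_bytes / (int)sizeof(double); /* 2 lanes */
    printf("f32: PACK_FACTOR = %d\n", width_bytes / nlanes_f32); /* 4 */
    printf("f64: PACK_FACTOR = %d\n", width_bytes / nlanes_f64); /* 8 */
    return 0;
}
```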
 
 /********************************************************************************
  ** Defining ufunc inner functions
  ********************************************************************************/
@@ -316,4 +668,50 @@ clear:;
 #endif
 }
 /**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];
+    const npy_intp ostep = steps[1];
+    npy_intp len = dimensions[0];
+#if @VCHK@
+    const int ilsize = sizeof(npyv_lanetype_@sfx@);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;
+    const npy_intp ostride = ostep / olsize;
+    assert(len <= 1 || (istep % ilsize == 0 && ostep % olsize == 0));
+
+    if (!is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_@sfx@(istride) &&
+        npyv_storable_stride_@sfx@(ostride))
+    {
+        if (istride == 1 && ostride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // @VCHK@
+    {
+        UNARY_LOOP {
+            const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+            *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+        }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
 /**end repeat**/
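The dispatching inner function above selects one of four kernel instantiations from the element strides, preferring the contiguous specializations. Its decision table, reduced to standalone C (`pick_kernel` and the labels are illustrative only):

```c
#include <stdio.h>

/* The stride dispatch above: contiguous loads/stores are used
 * wherever possible, strided variants only where needed. */
static const char *pick_kernel(long istride, long ostride)
{
    if (istride == 1 && ostride == 1) return "CONTIG_CONTIG";
    if (ostride == 1)                 return "NCONTIG_CONTIG";
    if (istride == 1)                 return "CONTIG_NCONTIG";
    return "NCONTIG_NCONTIG";
}

int main(void)
{
    printf("%s\n", pick_kernel(1, 1));  /* both contiguous */
    printf("%s\n", pick_kernel(3, 1));  /* strided input   */
    printf("%s\n", pick_kernel(1, 4));  /* strided output  */
    printf("%s\n", pick_kernel(3, 4));  /* both strided    */
    return 0;
}
```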
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 10c44ce30..1a5c0ffb8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -89,63 +89,33 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
  */
 
 /**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
-        AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else {
-        return 0;
-    }
-#endif
-    return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  * #type = npy_float, npy_double, npy_longdouble#
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #vector = 1, 1, 0#
  * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
  */
+
 /**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
  */
+
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+/* prototypes */
 static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
 #endif
 
 static inline int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
-        npy_is_aligned(args[0], sizeof(@type@))) {
-        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+    if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
         return 1;
     }
 #endif
@@ -189,139 +159,6 @@ NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
 }
 /**end repeat**/
 
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
-                              npy_bool * op)
-{
-    const __m128i mask = @vpre@_set1_epi8(0x1);
-    __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
-    __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
-    __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
-    rr = @vpre@_packs_epi16(rr, rr);
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storel_epi64((__m128i*)op, rr);
-#else
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-        int r = @vpre@_movemask_@vsuf@(a);
-        if (sizeof(@type@) == 8) {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1);
-        }
-        else {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1) & 1;
-            op[i + 2] = (r >> 2) & 1;
-            op[i + 3] = (r >> 3);
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
-    /* signbit mask 0x7FFFFFFF after andnot */
-    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
-                                             @vpre@_setzero_@vsuf@());
-#if @double@
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
-        /* fabs via masking of sign bit */
-        r1 = @vpre@_andnot_@vsuf@(mask, a);
-        r2 = @vpre@_andnot_@vsuf@(mask, b);
-        r3 = @vpre@_andnot_@vsuf@(mask, c);
-        r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
-        /* negative compare against max float, nan is always true */
-        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
-        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
-        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
-        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
-        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
-        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
-        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
-        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
-        /* flip results to what we want (andnot as there is no sse not) */
-        r1 = @vpre@_andnot_@vsuf@(r1, ones);
-        r2 = @vpre@_andnot_@vsuf@(r2, ones);
-        r3 = @vpre@_andnot_@vsuf@(r3, ones);
-        r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
-        r1 = @vpre@_cmpneq_@vsuf@(a, a);
-        r2 = @vpre@_cmpneq_@vsuf@(b, b);
-        r3 = @vpre@_cmpneq_@vsuf@(c, c);
-        r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
-        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
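The removed SSE2 kernels and the new universal ones share the same comparison trick: only infinity exceeds `FLT_MAX` in magnitude, and NaN drops out because any comparison with NaN is false. Standalone version:

```c
#include <float.h>
#include <math.h>
#include <stdio.h>

/* The test behind both the removed sse2_isinf_@TYPE@ and the added
 * npyv_isinf_f32: |x| > FLT_MAX holds only for +/-inf, and NaN
 * comparisons are false, so NaN correctly yields 0. */
static int isinf_via_fltmax(float x)
{
    return fabsf(x) > FLT_MAX;
}

int main(void)
{
    printf("%d %d %d %d\n",
           isinf_via_fltmax(INFINITY),   /* 1 */
           isinf_via_fltmax(-INFINITY),  /* 1 */
           isinf_via_fltmax(NAN),        /* 0 */
           isinf_via_fltmax(FLT_MAX));   /* 0 */
    return 0;
}
```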
@@ -714,85 +551,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 /**end repeat**/
 
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
-    const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
-    npy_intp num_remaining_elements = array_size;
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
-    @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-
-    npy_int32 index_ip[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
-    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
-    __m512i ones = _mm512_set1_@episize@(1);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1;
-        if (stride_ip == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
-        }
-#if @is_signbit@
-        x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
-        @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
-        fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
-        __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
-        _mm_mask_storeu_epi8(op, load_mask, out);
-
-        ip += @num_lanes@*stride_ip;
-        op += @num_lanes@;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
 /**begin repeat
  * #TYPE = CFLOAT, CDOUBLE#
  * #type = npy_float, npy_double#
  * #num_lanes = 16, 8#
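For reference, the `IMM8` constants the deleted AVX-512 path passed to `_mm512_fpclass_@vsuffix@_mask` decode as AVX-512DQ `VFPCLASS` category bits. The enum below is an illustrative decoding, not a NumPy or intrinsics header:

```c
#include <stdio.h>

/* AVX-512DQ VFPCLASS category bits (illustrative, not an API). */
enum vfpclass_bits {
    VFPCLASS_QNAN     = 0x01,
    VFPCLASS_POS_ZERO = 0x02,
    VFPCLASS_NEG_ZERO = 0x04,  /* signbit uses this after and-ing with -0.0,
                                  which folds every negative value to -0.0 */
    VFPCLASS_POS_INF  = 0x08,
    VFPCLASS_NEG_INF  = 0x10,
    VFPCLASS_DENORMAL = 0x20,
    VFPCLASS_NEGATIVE = 0x40,
    VFPCLASS_SNAN     = 0x80
};

int main(void)
{
    /* isfinite used 0x99 and then inverted the mask with _mm512_knot() */
    printf("isnan    IMM8 = 0x%02x\n", VFPCLASS_SNAN | VFPCLASS_QNAN);      /* 0x81 */
    printf("isinf    IMM8 = 0x%02x\n", VFPCLASS_POS_INF | VFPCLASS_NEG_INF); /* 0x18 */
    printf("isfinite IMM8 = 0x%02x\n",
           VFPCLASS_SNAN | VFPCLASS_NEG_INF | VFPCLASS_POS_INF | VFPCLASS_QNAN); /* 0x99 */
    return 0;
}
```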