author     Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2022-08-23 12:35:38 -0700
committer  Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2023-01-04 02:19:17 -0800
commit     3725e9f3237362037095f4100979a11864cfcc04 (patch)
tree       addcfc639979feffd9dc8efd54db3b6774beb601
parent     f73d53d1a44ac841458c5be852e05021feec5dff (diff)
download   numpy-3725e9f3237362037095f4100979a11864cfcc04.tar.gz
ENH: Implement SIMD versions of isnan, isinf, isfinite and signbit
NumPy has SIMD versions of the float / double `isnan`, `isinf`, `isfinite`, and `signbit` loops for SSE2 and AVX-512. The changes here replace the SSE2 versions with implementations built on NumPy's universal intrinsics, so other SIMD architectures get vectorized versions of these functions as well.
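For context, the new kernels compute these predicates with the bit-level tricks described in the comments further down: self-comparison for NaN, a magnitude check against FLT_MAX for inf/finite, and a shift of the sign bit for signbit. The following is a minimal scalar C sketch of those tricks only; the my_* helper names are hypothetical and nothing here is part of this patch.

    #include <float.h>   /* FLT_MAX */
    #include <math.h>    /* fabsf, INFINITY, NAN */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int my_isnan_f32(float v)    { return v != v; }              /* NaN never equals itself    */
    static int my_isinf_f32(float v)    { return fabsf(v) > FLT_MAX; }  /* |v| beyond largest finite  */
    static int my_isfinite_f32(float v) { return fabsf(v) <= FLT_MAX; } /* NaN also fails this test   */
    static int my_signbit_f32(float v)
    {
        uint32_t bits;
        memcpy(&bits, &v, sizeof bits);   /* reinterpret the IEEE-754 bits */
        return (int)(bits >> 31);         /* sign bit is the top bit       */
    }

    int main(void)
    {
        const float vals[] = { 1.0f, -0.0f, INFINITY, NAN };
        for (int i = 0; i < 4; i++) {
            printf("%g: nan=%d inf=%d finite=%d signbit=%d\n",
                   (double)vals[i],
                   my_isnan_f32(vals[i]), my_isinf_f32(vals[i]),
                   my_isfinite_f32(vals[i]), my_signbit_f32(vals[i]));
        }
        return 0;
    }

The SIMD loops added below perform the same checks per lane with npyv_* universal intrinsics and then pack the per-lane results into boolean bytes.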
-rw-r--r--  numpy/core/code_generators/generate_umath.py          |   8
-rw-r--r--  numpy/core/src/umath/loops.c.src                      |  24
-rw-r--r--  numpy/core/src/umath/loops.h.src                      |  12
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src    | 398
-rw-r--r--  numpy/core/src/umath/simd.inc.src                     | 262
5 files changed, 427 insertions, 277 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1a05ee4..f01ac4031 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -921,7 +921,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isnan'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
),
'isnat':
Ufunc(1, 1, None,
@@ -933,19 +933,19 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isinf'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
),
'isfinite':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isfinite'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
),
'signbit':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.signbit'),
None,
- TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(flts, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
),
'copysign':
Ufunc(2, 1, None,
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 7b070a084..2b70a4b9a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1306,6 +1306,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
+ * #fd = 1, 1, 0#
+ * #VCHK = 1, 1, 0#
*/
/**begin repeat1
* #kind = logical_and, logical_or#
@@ -1342,32 +1344,22 @@ NPY_NO_EXPORT void
}
}
+#if !@fd@
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
**/
-
-/**begin repeat2
- * #ISA = , _avx512_skx#
- * #isa = simd, avx512_skx#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((npy_bool *)op1) = @func@(in1) != 0;
- }
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
-#endif
-/**end repeat2**/
/**end repeat1**/
+#endif
NPY_NO_EXPORT void
@TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 411d53e94..c13a6491f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -241,7 +241,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
- * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal,
+ * isnan, isinf, isfinite, signbit#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -400,6 +401,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
* #c = f, f, , l#
* #C = F, F, , L#
* #half = 1, 0, 0, 0#
+ * #fd = 0, 1, 1, 0#
*/
/**begin repeat1
@@ -428,13 +430,13 @@ NPY_NO_EXPORT void
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
**/
-/**begin repeat2
- * #ISA = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
/**end repeat2**/
/**end repeat1**/
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index c4e7b8929..2096893b7 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -12,10 +12,14 @@
* such small operations that this file covers.
*/
#define NPY_SIMD_FORCE_128
+#include <float.h>
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
/**********************************************************
** Scalars
**********************************************************/
@@ -80,6 +84,182 @@ NPY_FINLINE double c_square_f64(double a)
#define c_rint_f32 npy_rintf
#define c_rint_f64 npy_rint
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isnan_@sfx@(npyv_@sfx@ v)
+{
+ // (v != v) >> (size - 1)
+ npyv_@sfx@ r = npyv_cvt_@sfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isinf_@sfx@(npyv_@sfx@ v)
+{
+ // (abs(v) > fltmax) >> (size - 1)
+ const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+ npyv_@sfx@ r = vcagtq_@sfx@(v, fltmax);
+#else
+ // fabs via masking of sign bit
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+ r = npyv_cmpgt_@sfx@(r, fltmax);
+#endif
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isfinite_@sfx@(npyv_@sfx@ v)
+{
+ // ((v & signmask) <= fltmax) >> (size-1)
+ const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+ npyv_@sfx@ r = vcaleq_@sfx@(v, fltmax);
+#else
+ // fabs via masking of sign bit
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+ r = npyv_cmple_@sfx@(r, fltmax);
+#endif
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_signbit_@sfx@(npyv_@sfx@ v)
+{
+ return npyv_shri_u@ssfx@(v, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+#endif // @VCHK@
+/**end repeat**/
+
+#if defined(NPY_HAVE_NEON)
+ #define PREPACK_ISFINITE 1
+ #define PREPACK_SIGNBIT 1
+
+ #if NPY_SIMD_F32
+
+ static NPY_INLINE npyv_u8
+ npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ {
+ // F32 exponent is 8-bits, which means we can pack multiple into
+ // a single vector. We shift out sign bit so that we're left
+ // with only exponent in high byte. If not all bits are set,
+ // then we've got a finite number.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_shli_u32(v0, 1);
+ tbl.val[1] = npyv_shli_u32(v1, 1);
+ tbl.val[2] = npyv_shli_u32(v2, 1);
+ tbl.val[3] = npyv_shli_u32(v3, 1);
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+ const npyv_u8 expmask = npyv_setall_u8(0xff);
+ r = npyv_cmpneq_u8(r, expmask);
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ static NPY_INLINE npyv_u8
+ npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ {
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+ uint8x16x4_t tbl;
+ tbl.val[0] = v0;
+ tbl.val[1] = v1;
+ tbl.val[2] = v2;
+ tbl.val[3] = v3;
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ #endif // NPY_SIMD_F32
+
+ #if NPY_SIMD_F64
+
+ static NPY_INLINE npyv_u8
+ npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+ {
+ // F64 exponent is 11-bits, which means we can pack multiple into
+ // a single vector. We'll need to use u16 to fit all exponent
+ // bits. If not all bits are set, then we've got a finite number.
+ uint8x16x4_t t0123, t4567;
+ t0123.val[0] = v0;
+ t0123.val[1] = v1;
+ t0123.val[2] = v2;
+ t0123.val[3] = v3;
+ t4567.val[0] = v4;
+ t4567.val[1] = v5;
+ t4567.val[2] = v6;
+ t4567.val[3] = v7;
+
+ const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+ npyv_u16 r0 = vqtbl4q_u8(t0123, permute);
+ npyv_u16 r1 = vqtbl4q_u8(t4567, permute);
+
+ const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+ r0 = npyv_and_u16(r0, expmask);
+ r0 = npyv_cmpneq_u16(r0, expmask);
+ r0 = npyv_shri_u16(r0, 15);
+ r1 = npyv_and_u16(r1, expmask);
+ r1 = npyv_cmpneq_u16(r1, expmask);
+ r1 = npyv_shri_u16(r1, 15);
+
+ npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+ return r;
+ }
+
+ static NPY_INLINE npyv_u8
+ npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+ {
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+
+ // vuzp2 faster than vtbl for f64
+ npyv_u32 v01 = vuzp2q_u32(v0, v1);
+ npyv_u32 v23 = vuzp2q_u32(v2, v3);
+ npyv_u32 v45 = vuzp2q_u32(v4, v5);
+ npyv_u32 v67 = vuzp2q_u32(v6, v7);
+
+ npyv_u16 v0123 = vuzp2q_u16(v01, v23);
+ npyv_u16 v4567 = vuzp2q_u16(v45, v67);
+
+ npyv_u8 r = vuzp2q_u8(v0123, v4567);
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ #endif // NPY_SIMD_F64
+
+#else
+ #define PREPACK_ISFINITE 0
+ #define PREPACK_SIGNBIT 0
+#endif // defined(NPY_HAVE_NEON)
+
+#endif // NPY_SIMD
+
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
@@ -149,6 +329,9 @@ NPY_FINLINE double c_square_f64(double a)
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
* #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
*/
#if @VCHK@
/**begin repeat1
@@ -249,10 +432,179 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
}
/**end repeat2**/
/**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 1, 1, 1, 1#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *ip = src;
+ npy_bool *op = dst;
+
+ // How many vectors can be packed into a u8 / bool vector?
+ #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+ assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * @unroll@ * PACK_FACTOR;
+
+ // unrolled iterations
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ // Load vectors
+ #if @STYPE@ == CONTIG
+ // contiguous input
+ npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n)
+ npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@));
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78
+ #endif
+ #else
+ // non-contiguous input
+ npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride);
+ #endif
+ #endif
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #if @ssfx@ == 32
+ npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@);
+ #elif @ssfx@ == 64
+ npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@,
+ v4_@N@, v5_@N@, v6_@N@, v7_@N@);
+ #endif
+ #else
+ npyv_u@ssfx@ r0_@N@ = npyv_@kind@_@sfx@(v0_@N@);
+ npyv_u@ssfx@ r1_@N@ = npyv_@kind@_@sfx@(v1_@N@);
+ npyv_u@ssfx@ r2_@N@ = npyv_@kind@_@sfx@(v2_@N@);
+ npyv_u@ssfx@ r3_@N@ = npyv_@kind@_@sfx@(v3_@N@);
+ #if PACK_FACTOR == 8
+ npyv_u@ssfx@ r4_@N@ = npyv_@kind@_@sfx@(v4_@N@);
+ npyv_u@ssfx@ r5_@N@ = npyv_@kind@_@sfx@(v5_@N@);
+ npyv_u@ssfx@ r6_@N@ = npyv_@kind@_@sfx@(v6_@N@);
+ npyv_u@ssfx@ r7_@N@ = npyv_@kind@_@sfx@(v7_@N@);
+ #endif // PACK_FACTOR == 8
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @DTYPE@ == CONTIG
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ // Nothing to do, results already packed
+ #else
+ // Pack results
+ #if PACK_FACTOR == 4
+ npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@));
+ #elif PACK_FACTOR == 8
+ npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@,
+ r4_@N@, r5_@N@, r6_@N@, r7_@N@));
+ #endif // PACK_FACTOR == 8
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+
+ npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@);
+
+ #else // @DTYPE@ == CONTIG
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ // Results are packed, so we can just loop over them
+ npy_uint8 lane_@N@[npyv_nlanes_u8];
+ npyv_store_u8(lane_@N@, r_@N@);
+ for (int ln=0; ln<npyv_nlanes_u8; ++ln){
+ op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
+ }
+ #else
+ // Results are not packed. Use template to loop.
+ /**begin repeat4
+ * #R = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ #if @R@ < PACK_FACTOR
+ npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
+ npyv_store_u8(lane@R@_@N@, r@R@_@N@);
+ op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ #endif
+ /**end repeat4**/
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #endif // @DTYPE@ == CONTIG
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+ }
+
+ // vector-sized iterations
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ #else
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ #endif
+
+ npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, r);
+
+ op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ }
+
+ #undef PACK_FACTOR
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = (npy_@kind@(*ip) != 0);
+ }
+
+
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
#endif // @VCHK@
/**end repeat**/
#undef WORKAROUND_CLANG_RECIPROCAL_BUG
+#undef PREPACK_ISFINITE
+#undef PREPACK_SIGNBIT
/********************************************************************************
** Defining ufunc inner functions
@@ -316,4 +668,50 @@ clear:;
#endif
}
/**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ const char *ip = args[0];
+ char *op = args[1];
+ const npy_intp istep = steps[0];
+ const npy_intp ostep = steps[1];
+ npy_intp len = dimensions[0];
+#if @VCHK@
+ const int ilsize = sizeof(npyv_lanetype_@sfx@);
+ const int olsize = sizeof(npy_bool);
+ const npy_intp istride = istep / ilsize;
+ const npy_intp ostride = ostep / olsize;
+ assert(len <= 1 || (istep % ilsize == 0 && ostep % olsize == 0));
+
+ if (!is_mem_overlap(ip, istep, op, ostep, len) &&
+ npyv_loadable_stride_@sfx@(istride) &&
+ npyv_storable_stride_@sfx@(ostride))
+ {
+ if (istride == 1 && ostride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+ }
+ else if (ostride == 1) {
+ simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+ }
+ else if (istride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+ } else {
+ simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+ }
+ } else
+#endif // @VCHK@
+ {
+ UNARY_LOOP {
+ const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+ *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+ }
+ }
+
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 10c44ce30..1a5c0ffb8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -89,63 +89,33 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
*/
/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
- AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else {
- return 0;
- }
-#endif
- return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
* #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/
+
/**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
*/
+
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+/* prototypes */
static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
#endif
static inline int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
- npy_is_aligned(args[0], sizeof(@type@))) {
- sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+ if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+ sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
}
#endif
@@ -189,139 +159,6 @@ NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
}
/**end repeat**/
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
- npy_bool * op)
-{
- const __m128i mask = @vpre@_set1_epi8(0x1);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)op, rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- int r = @vpre@_movemask_@vsuf@(a);
- if (sizeof(@type@) == 8) {
- op[i] = r & 1;
- op[i + 1] = (r >> 1);
- }
- else {
- op[i] = r & 1;
- op[i + 1] = (r >> 1) & 1;
- op[i + 2] = (r >> 2) & 1;
- op[i + 3] = (r >> 3);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
- /* signbit mask 0x7FFFFFFF after andnot */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
- const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
- @vpre@_setzero_@vsuf@());
-#if @double@
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
- /* fabs via masking of sign bit */
- r1 = @vpre@_andnot_@vsuf@(mask, a);
- r2 = @vpre@_andnot_@vsuf@(mask, b);
- r3 = @vpre@_andnot_@vsuf@(mask, c);
- r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
- /* negative compare against max float, nan is always true */
- r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
- r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
- r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
- r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
- r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
- r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
- r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
- r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
- /* flip results to what we want (andnot as there is no sse not) */
- r1 = @vpre@_andnot_@vsuf@(r1, ones);
- r2 = @vpre@_andnot_@vsuf@(r2, ones);
- r3 = @vpre@_andnot_@vsuf@(r3, ones);
- r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
- r1 = @vpre@_cmpneq_@vsuf@(a, a);
- r2 = @vpre@_cmpneq_@vsuf@(b, b);
- r3 = @vpre@_cmpneq_@vsuf@(c, c);
- r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
@@ -714,85 +551,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
/**end repeat**/
/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
- const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
- npy_intp num_remaining_elements = array_size;
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
- @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum
- * index will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
-
- npy_int32 index_ip[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_ip[ii] = ii*stride_ip;
- }
- @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
- @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
- __m512i ones = _mm512_set1_@episize@(1);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
- }
-#if @is_signbit@
- x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
- @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
- fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
- __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
- _mm_mask_storeu_epi8(op, load_mask, out);
-
- ip += @num_lanes@*stride_ip;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
* #num_lanes = 16, 8#