Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py           |   8
-rw-r--r--  numpy/core/setup.py                                     |   1
-rw-r--r--  numpy/core/src/umath/loops.h.src                        |  17
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src      | 425
-rw-r--r--  numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src   | 556
-rw-r--r--  numpy/core/tests/test_simd.py                           |  33
6 files changed, 577 insertions(+), 463 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f01ac4031..e7e63f06f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -921,7 +921,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isnan'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
),
'isnat':
Ufunc(1, 1, None,
@@ -933,19 +933,19 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isinf'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
),
'isfinite':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isfinite'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
+ TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
),
'signbit':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.signbit'),
None,
- TD(flts, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
+ TD(flts, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
),
'copysign':
Ufunc(2, 1, None,
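
The four kernels re-pointed above (isnan, isinf, isfinite, signbit) are plain bit-level predicates; the comments in the new dispatch source below describe them as "(v != v) >> (size - 1)", "(abs(v) > fltmax) >> (size - 1)", "((v & signmask) <= fltmax) >> (size-1)" and a sign-bit shift. A minimal scalar sketch of those per-lane tests follows; the my_* helpers are hypothetical names used only for illustration (the real scalar fallback uses npy_isnan/npy_isinf/npy_isfinite/npy_signbit, as seen in the fallback loops further down).

    /* Hypothetical scalar helpers mirroring the per-lane SIMD predicates. */
    #include <float.h>
    #include <stdint.h>
    #include <string.h>

    static unsigned char my_isnan_f32(float v)    { return v != v; }                 /* NaN != NaN     */
    static unsigned char my_isinf_f32(float v)    { float a = v < 0.0f ? -v : v;
                                                    return a > FLT_MAX; }            /* |v| > FLT_MAX  */
    static unsigned char my_isfinite_f32(float v) { float a = v < 0.0f ? -v : v;
                                                    return a <= FLT_MAX; }           /* |v| <= FLT_MAX */
    static unsigned char my_signbit_f32(float v)  {
        uint32_t bits; memcpy(&bits, &v, sizeof bits);
        return (unsigned char)(bits >> 31);   /* the SIMD kernels do the same shift per lane */
    }
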
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a5bd4a056..691c53f16 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1007,6 +1007,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary.dispatch.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
+ join('src', 'umath', 'loops_unary_fp_le.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_logical.dispatch.c.src'),
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index c13a6491f..26742946f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -241,8 +241,21 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
- * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal,
- * isnan, isinf, isfinite, signbit#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp_le.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
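
The new repeat block above declares one dispatchable prototype per (TYPE, kind) pair, guarded by the generated loops_unary_fp_le.dispatch.h. As a sketch, a single expansion of that template produces declarations of this shape (the per-target definitions come from NPY_CPU_DISPATCH_CURFX in the new .dispatch.c.src file):

    NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isnan,
        (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
    NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_signbit,
        (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
    /* ... and likewise for the remaining six (TYPE, kind) combinations. */
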
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 54459d545..c4e7b8929 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -3,13 +3,8 @@
** sse2 sse41
** vsx2
** neon asimd
+ ** vx vxe
**/
-
-/**
- * We ran into lots of test failures trying to enable this file for
- * VSE and VE on s390x (qemu) so avoiding these targets for now.
-*/
-
/**
* Force use SSE only on x86, even if AVX2 or AVX512F are enabled
* through the baseline, since scatter(AVX512F) and gather very costly
@@ -17,17 +12,10 @@
* such small operations that this file covers.
*/
#define NPY_SIMD_FORCE_128
-#define _UMATHMODULE
-#define _MULTIARRAYMODULE
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#include <float.h>
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
-#include "lowlevel_strided_loops.h"
-// Provides the various *_LOOP macros
-#include "fast_loop_macros.h"
/**********************************************************
** Scalars
**********************************************************/
@@ -92,198 +80,6 @@ NPY_FINLINE double c_square_f64(double a)
#define c_rint_f32 npy_rintf
#define c_rint_f64 npy_rint
-/*******************************************************************************
- ** extra SIMD intrinsics
- ******************************************************************************/
-
-#if NPY_SIMD
-/**begin repeat
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
- * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
- * #FDMAX = FLT_MAX, DBL_MAX#
- * #fd = f, #
- * #ssfx = 32, 64#
- */
-#if @VCHK@
-
-static NPY_INLINE npyv_u@ssfx@
-npyv_isnan_@sfx@(npyv_@sfx@ v)
-{
- // (v != v) >> (size - 1)
-#if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
- // sse npyv_cmpneq_@sfx@ define includes a cast already
- npyv_u@ssfx@ r = npyv_cmpneq_@sfx@(v, v);
-#else
- npyv_u@ssfx@ r = npyv_cvt_u@ssfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
-#endif
- return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
-}
-
-static NPY_INLINE npyv_u@ssfx@
-npyv_isinf_@sfx@(npyv_@sfx@ v)
-{
- // (abs(v) > fltmax) >> (size - 1)
- const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
-#if defined(NPY_HAVE_NEON)
- npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax);
-#else
- // fabs via masking of sign bit
- const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
- npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask));
- #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
- // return cast already done in npyv_cmpgt_@sfx@
- npyv_u@ssfx@ r = npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax);
- #else
- npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax));
- #endif
-#endif
- return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
-}
-
-static NPY_INLINE npyv_u@ssfx@
-npyv_isfinite_@sfx@(npyv_@sfx@ v)
-{
- // ((v & signmask) <= fltmax) >> (size-1)
- const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
-#if defined(NPY_HAVE_NEON)
- npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax);
-#else
- // fabs via masking of sign bit
- const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
- npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask));
- #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
- // return cast already done in npyv_cmpgt_@sfx@
- npyv_u@ssfx@ r = npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax);
- #else
- npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax));
- #endif
-#endif
- return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
-}
-
-static NPY_INLINE npyv_u@ssfx@
-npyv_signbit_@sfx@(npyv_@sfx@ v)
-{
- return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1);
-}
-
-#endif // @VCHK@
-/**end repeat**/
-
-// In these functions we use vqtbl4q_u8 which is only available on aarch64
-#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
- #define PREPACK_ISFINITE 1
- #define PREPACK_SIGNBIT 1
-
- #if NPY_SIMD_F32
-
- static NPY_INLINE npyv_u8
- npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
- {
- // F32 exponent is 8-bits, which means we can pack multiple into
- // a single vector. We shift out sign bit so that we're left
- // with only exponent in high byte. If not all bits are set,
- // then we've got a finite number.
- uint8x16x4_t tbl;
- tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
- tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
- tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
- tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
-
- const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
- npyv_u8 r = vqtbl4q_u8(tbl, permute);
-
- const npyv_u8 expmask = npyv_setall_u8(0xff);
- r = npyv_cmpneq_u8(r, expmask);
- r = vshrq_n_u8(r, 7);
- return r;
- }
-
- static NPY_INLINE npyv_u8
- npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
- {
- // We only need high byte for signbit, which means we can pack
- // multiple inputs into a single vector.
- uint8x16x4_t tbl;
- tbl.val[0] = npyv_reinterpret_u8_f32(v0);
- tbl.val[1] = npyv_reinterpret_u8_f32(v1);
- tbl.val[2] = npyv_reinterpret_u8_f32(v2);
- tbl.val[3] = npyv_reinterpret_u8_f32(v3);
-
- const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
- npyv_u8 r = vqtbl4q_u8(tbl, permute);
- r = vshrq_n_u8(r, 7);
- return r;
- }
-
- #endif // NPY_SIMD_F32
-
- #if NPY_SIMD_F64
-
- static NPY_INLINE npyv_u8
- npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
- npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
- {
- // F64 exponent is 11-bits, which means we can pack multiple into
- // a single vector. We'll need to use u16 to fit all exponent
- // bits. If not all bits are set, then we've got a finite number.
- uint8x16x4_t t0123, t4567;
- t0123.val[0] = npyv_reinterpret_u8_f64(v0);
- t0123.val[1] = npyv_reinterpret_u8_f64(v1);
- t0123.val[2] = npyv_reinterpret_u8_f64(v2);
- t0123.val[3] = npyv_reinterpret_u8_f64(v3);
- t4567.val[0] = npyv_reinterpret_u8_f64(v4);
- t4567.val[1] = npyv_reinterpret_u8_f64(v5);
- t4567.val[2] = npyv_reinterpret_u8_f64(v6);
- t4567.val[3] = npyv_reinterpret_u8_f64(v7);
-
- const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
- npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
- npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));
-
- const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
- r0 = npyv_and_u16(r0, expmask);
- r0 = npyv_cmpneq_u16(r0, expmask);
- r0 = npyv_shri_u16(r0, 15);
- r1 = npyv_and_u16(r1, expmask);
- r1 = npyv_cmpneq_u16(r1, expmask);
- r1 = npyv_shri_u16(r1, 15);
-
- npyv_u8 r = npyv_pack_b8_b16(r0, r1);
- return r;
- }
-
- static NPY_INLINE npyv_u8
- npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
- npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
- {
- // We only need high byte for signbit, which means we can pack
- // multiple inputs into a single vector.
-
- // vuzp2 faster than vtbl for f64
- npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
- npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
- npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
- npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
-
- npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
- npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
-
- npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
- r = vshrq_n_u8(r, 7);
- return r;
- }
-
- #endif // NPY_SIMD_F64
-
-#else
- #define PREPACK_ISFINITE 0
- #define PREPACK_SIGNBIT 0
-#endif // defined(NPY_HAVE_NEON) && defined(__aarch64__)
-
-#endif // NPY_SIMD
-
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
@@ -353,9 +149,6 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
* #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
- * #FDMAX = FLT_MAX, DBL_MAX#
- * #fd = f, #
- * #ssfx = 32, 64#
*/
#if @VCHK@
/**begin repeat1
@@ -456,179 +249,10 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
}
/**end repeat2**/
/**end repeat1**/
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite, signbit#
- * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT#
- */
-/**begin repeat2
- * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
- * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
- * #unroll = 1, 1, 1, 1#
- */
-static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
-(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
-{
- const npyv_lanetype_@sfx@ *ip = src;
- npy_bool *op = dst;
-
- // How many vectors can be packed into a u8 / bool vector?
- #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
- assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
-
- const int vstep = npyv_nlanes_@sfx@;
- const int wstep = vstep * @unroll@ * PACK_FACTOR;
-
- // unrolled iterations
- for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- /**begin repeat3
- * #N = 0, 1, 2, 3#
- */
- #if @unroll@ > @N@
- // Load vectors
- #if @STYPE@ == CONTIG
- // contiguous input
- npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n)
- npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@));
- npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@));
- npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@));
- #if PACK_FACTOR == 8
- npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@));
- npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@));
- npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@));
- npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78
- #endif
- #else
- // non-contiguous input
- npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride);
- #if PACK_FACTOR == 8
- npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride);
- npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride);
- #endif
- #endif
- #endif // @unroll@ > @N@
- /**end repeat3**/
-
- /**begin repeat3
- * #N = 0, 1, 2, 3#
- */
- #if @unroll@ > @N@
- #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
- #if @ssfx@ == 32
- npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@);
- #elif @ssfx@ == 64
- npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@,
- v4_@N@, v5_@N@, v6_@N@, v7_@N@);
- #endif
- #else
- npyv_b@ssfx@ r0_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v0_@N@));
- npyv_b@ssfx@ r1_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v1_@N@));
- npyv_b@ssfx@ r2_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v2_@N@));
- npyv_b@ssfx@ r3_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v3_@N@));
- #if PACK_FACTOR == 8
- npyv_b@ssfx@ r4_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v4_@N@));
- npyv_b@ssfx@ r5_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v5_@N@));
- npyv_b@ssfx@ r6_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v6_@N@));
- npyv_b@ssfx@ r7_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v7_@N@));
- #endif // PACK_FACTOR == 8
- #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
- #endif // @unroll@ > @N@
- /**end repeat3**/
-
- /**begin repeat3
- * #N = 0, 1, 2, 3#
- */
- #if @unroll@ > @N@
- #if @DTYPE@ == CONTIG
- #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
- // Nothing to do, results already packed
- #else
- // Pack results
- #if PACK_FACTOR == 4
- npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@));
- #elif PACK_FACTOR == 8
- npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@,
- r4_@N@, r5_@N@, r6_@N@, r7_@N@));
- #endif // PACK_FACTOR == 8
- #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
-
- npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@);
-
- #else // @DTYPE@ == CONTIG
- #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
- // Results are packed, so we can just loop over them
- npy_uint8 lane_@N@[npyv_nlanes_u8];
- npyv_store_u8(lane_@N@, r_@N@);
- for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
- op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
- }
- #else
- // Results are not packed. Use template to loop.
- /**begin repeat4
- * #R = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- #if @R@ < PACK_FACTOR
- npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
- npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@));
- op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
- op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
- #if npyv_nlanes_@sfx@ == 4
- op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)];
- op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)];
- #endif
- #endif
- /**end repeat4**/
- #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
- #endif // @DTYPE@ == CONTIG
- #endif // @unroll@ > @N@
- /**end repeat3**/
- }
-
- // vector-sized iterations
- for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
- #if @STYPE@ == CONTIG
- npyv_@sfx@ v = npyv_load_@sfx@(ip);
- #else
- npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
- #endif
-
- npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
-
- npy_uint8 lane[npyv_nlanes_u8];
- npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));
-
- op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
- op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
- #if npyv_nlanes_@sfx@ == 4
- op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
- op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
- #endif
- }
-
- #undef PACK_FACTOR
-
- // Scalar loop to finish off
- for (; len > 0; --len, ip += istride, op += ostride) {
- *op = (npy_@kind@(*ip) != 0);
- }
-
-
- npyv_cleanup();
-}
-/**end repeat2**/
-/**end repeat1**/
-
#endif // @VCHK@
/**end repeat**/
#undef WORKAROUND_CLANG_RECIPROCAL_BUG
-#undef PREPACK_ISFINITE
-#undef PREPACK_SIGNBIT
/********************************************************************************
** Defining ufunc inner functions
@@ -692,51 +316,4 @@ clear:;
#endif
}
/**end repeat1**/
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite, signbit#
- **/
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-#if @VCHK@
- const char *ip = args[0];
- char *op = args[1];
- const npy_intp istep = steps[0];
- const npy_intp ostep = steps[1];
- npy_intp len = dimensions[0];
- const int ilsize = sizeof(npyv_lanetype_@sfx@);
- const int olsize = sizeof(npy_bool);
- const npy_intp istride = istep / ilsize;
- const npy_intp ostride = ostep / olsize;
- assert(len <= 1 || ostep % olsize == 0);
-
- if ((istep % ilsize == 0) &&
- !is_mem_overlap(ip, istep, op, ostep, len) &&
- npyv_loadable_stride_@sfx@(istride) &&
- npyv_storable_stride_@sfx@(ostride))
- {
- if (istride == 1 && ostride == 1) {
- simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
- }
- else if (ostride == 1) {
- simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
- }
- else if (istride == 1) {
- simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
- } else {
- simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
- }
- } else
-#endif // @VCHK@
- {
- UNARY_LOOP {
- const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
- *((npy_bool *)op1) = (npy_@kind@(in) != 0);
- }
- }
-
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
/**end repeat**/
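
The predicate kernels removed above store their per-lane 0/1 results by reinterpreting the result vector as bytes and reading every sizeof(lane)-th byte, which only lands on the 1 when a lane's lowest-addressed byte is its least significant byte, i.e. on little-endian targets. That appears to be why the new loops_unary_fp_le.dispatch.c.src (presumably "_le" for little-endian) special-cases VX/VXE on big-endian s390x by skipping the shift and keeping the full all-ones mask. A minimal self-contained sketch of that byte-extraction step, assuming 32-bit lanes in a 128-bit register:

    /* Byte-extraction sketch, assuming little-endian 32-bit lanes. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint32_t lanes[4] = {1u, 0u, 1u, 1u};      /* post-shift predicate results     */
        uint8_t  bytes[sizeof lanes];
        memcpy(bytes, lanes, sizeof lanes);        /* plays the role of npyv_store_u8  */
        for (int i = 0; i < 4; ++i) {
            /* op[i*ostride] = lane[i * sizeof(npyv_lanetype_f32)] in the loops above */
            printf("%u", bytes[i * sizeof(uint32_t)]);   /* LE prints 1011, BE prints 0000 */
        }
        printf("\n");
        return 0;
    }
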
diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
new file mode 100644
index 000000000..fed8213df
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
@@ -0,0 +1,556 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ **/
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src.
+ * However, there is an issue with enabling the code here for VX and VXE,
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN; we suspect this is a big-endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE, so we don't regress performance while adding this
+ * code for other platforms.
+*/
+
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE for
+ * the small operations that this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isnan_@sfx@(npyv_@sfx@ v)
+{
+ // (v != v) >> (size - 1)
+#if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
+ // sse npyv_cmpneq_@sfx@ define includes a cast already
+ npyv_u@ssfx@ r = npyv_cmpneq_@sfx@(v, v);
+#else
+ npyv_u@ssfx@ r = npyv_cvt_u@ssfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
+#endif
+#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE)
+ // don't do the shift on s390x
+ return r;
+#else
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+#endif
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isinf_@sfx@(npyv_@sfx@ v)
+{
+ // (abs(v) > fltmax) >> (size - 1)
+ const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+ npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax);
+#else
+ // fabs via masking of sign bit
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask));
+ #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
+ // return cast already done in npyv_cmpgt_@sfx@
+ npyv_u@ssfx@ r = npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax);
+ #else
+ npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax));
+ #endif
+#endif
+#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE)
+ // don't do the shift on s390x
+ return r;
+#else
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+#endif
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isfinite_@sfx@(npyv_@sfx@ v)
+{
+ // ((v & signmask) <= fltmax) >> (size-1)
+ const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+ npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax);
+#else
+ // fabs via masking of sign bit
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask));
+ #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41)
+ // return cast already done in npyv_cmpgt_@sfx@
+ npyv_u@ssfx@ r = npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax);
+ #else
+ npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax));
+ #endif
+#endif
+#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE)
+ // don't do the shift on s390x
+ return r;
+#else
+ return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+#endif
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_signbit_@sfx@(npyv_@sfx@ v)
+{
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+ // via masking of sign bit
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_and_@sfx@(v, signmask));
+ return r;
+#else
+ return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1);
+#endif
+}
+
+#endif // @VCHK@
+/**end repeat**/
+
+// In these functions we use vqtbl4q_u8 which is only available on aarch64
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ #define PREPACK_ISFINITE 1
+ #define PREPACK_SIGNBIT 1
+
+ #if NPY_SIMD_F32
+
+ static NPY_INLINE npyv_u8
+ npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ {
+ // F32 exponent is 8-bits, which means we can pack multiple into
+ // a single vector. We shift out sign bit so that we're left
+ // with only exponent in high byte. If not all bits are set,
+ // then we've got a finite number.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+ tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+ tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+ tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+ const npyv_u8 expmask = npyv_setall_u8(0xff);
+ r = npyv_cmpneq_u8(r, expmask);
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ static NPY_INLINE npyv_u8
+ npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ {
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+ tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+ tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+ tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ #endif // NPY_SIMD_F32
+
+ #if NPY_SIMD_F64
+
+ static NPY_INLINE npyv_u8
+ npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+ {
+ // F64 exponent is 11-bits, which means we can pack multiple into
+ // a single vector. We'll need to use u16 to fit all exponent
+ // bits. If not all bits are set, then we've got a finite number.
+ uint8x16x4_t t0123, t4567;
+ t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+ t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+ t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+ t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+ t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+ t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+ t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+ t4567.val[3] = npyv_reinterpret_u8_f64(v7);
+
+ const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+ npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+ npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));
+
+ const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+ r0 = npyv_and_u16(r0, expmask);
+ r0 = npyv_cmpneq_u16(r0, expmask);
+ r0 = npyv_shri_u16(r0, 15);
+ r1 = npyv_and_u16(r1, expmask);
+ r1 = npyv_cmpneq_u16(r1, expmask);
+ r1 = npyv_shri_u16(r1, 15);
+
+ npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+ return r;
+ }
+
+ static NPY_INLINE npyv_u8
+ npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+ {
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+
+ // vuzp2 faster than vtbl for f64
+ npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+ npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+ npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+ npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+ npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+ npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+ npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
+ r = vshrq_n_u8(r, 7);
+ return r;
+ }
+
+ #endif // NPY_SIMD_F64
+
+#else
+ #define PREPACK_ISFINITE 0
+ #define PREPACK_SIGNBIT 0
+#endif // defined(NPY_HAVE_NEON) && defined(__aarch64__)
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ * for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ * to fill the remaining lanes with 1.0 to avoid a divide-by-zero fp
+ * exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+
+/*
+ * clang has a bug that's present at -O1 or greater. When partially loading a
+ * vector register for a reciprocal operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero. The partial load is paired with a partial
+ * store after the reciprocal operation. clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements. This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements. If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug. `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64. It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default for NumPy builds in
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and clang upstream have different versions that overlap
+ */
+#if defined(__clang__)
+ #if defined(__apple_build_version__)
+ // Apple Clang
+ #if __apple_build_version__ < 12000000
+ // Apple Clang before v12
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Apple Clang after v12, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
+ #else
+ // Apple Clang after v12, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+ #endif
+ #else
+ // Clang, not Apple Clang
+ #if __clang_major__ < 10
+ // Clang before v10
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+ #elif defined(_MSC_VER)
+ // clang-cl has the same bug
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Clang v10+, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 0
+ #else
+ // Clang v10+, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_RECIPROCAL_BUG 1
+ #endif
+ #endif
+#else
+// Not a Clang compiler
+#define WORKAROUND_CLANG_RECIPROCAL_BUG 0
+#endif
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 1, 1, 1, 1#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *ip = src;
+ npy_bool *op = dst;
+
+ // How many vectors can be packed into a u8 / bool vector?
+ #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+ assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * @unroll@ * PACK_FACTOR;
+
+ // unrolled iterations
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ // Load vectors
+ #if @STYPE@ == CONTIG
+ // contiguous input
+ npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n)
+ npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@));
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@));
+ npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78
+ #endif
+ #else
+ // non-contiguous input
+ npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride);
+ npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride);
+ #endif
+ #endif
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #if @ssfx@ == 32
+ npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@);
+ #elif @ssfx@ == 64
+ npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@,
+ v4_@N@, v5_@N@, v6_@N@, v7_@N@);
+ #endif
+ #else
+ npyv_b@ssfx@ r0_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v0_@N@));
+ npyv_b@ssfx@ r1_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v1_@N@));
+ npyv_b@ssfx@ r2_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v2_@N@));
+ npyv_b@ssfx@ r3_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v3_@N@));
+ #if PACK_FACTOR == 8
+ npyv_b@ssfx@ r4_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v4_@N@));
+ npyv_b@ssfx@ r5_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v5_@N@));
+ npyv_b@ssfx@ r6_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v6_@N@));
+ npyv_b@ssfx@ r7_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v7_@N@));
+ #endif // PACK_FACTOR == 8
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @DTYPE@ == CONTIG
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ // Nothing to do, results already packed
+ #else
+ // Pack results
+ #if PACK_FACTOR == 4
+ npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@));
+ #elif PACK_FACTOR == 8
+ npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@,
+ r4_@N@, r5_@N@, r6_@N@, r7_@N@));
+ #endif // PACK_FACTOR == 8
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+
+ npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@);
+
+ #else // @DTYPE@ == CONTIG
+ #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ // Results are packed, so we can just loop over them
+ npy_uint8 lane_@N@[npyv_nlanes_u8];
+ npyv_store_u8(lane_@N@, r_@N@);
+ for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
+ op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
+ }
+ #else
+ // Results are not packed. Use template to loop.
+ /**begin repeat4
+ * #R = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ #if @R@ < PACK_FACTOR
+ npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
+ npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@));
+ op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ #endif
+ /**end repeat4**/
+ #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+ #endif // @DTYPE@ == CONTIG
+ #endif // @unroll@ > @N@
+ /**end repeat3**/
+ }
+
+ // vector-sized iterations
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ #else
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ #endif
+
+ npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));
+
+ op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ }
+
+ #undef PACK_FACTOR
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = (npy_@kind@(*ip) != 0);
+ }
+
+
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
+#endif // @VCHK@
+/**end repeat**/
+
+#undef WORKAROUND_CLANG_RECIPROCAL_BUG
+#undef PREPACK_ISFINITE
+#undef PREPACK_SIGNBIT
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ */
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VCHK@
+ const char *ip = args[0];
+ char *op = args[1];
+ const npy_intp istep = steps[0];
+ const npy_intp ostep = steps[1];
+ npy_intp len = dimensions[0];
+ const int ilsize = sizeof(npyv_lanetype_@sfx@);
+ const int olsize = sizeof(npy_bool);
+ const npy_intp istride = istep / ilsize;
+ const npy_intp ostride = ostep / olsize;
+ assert(len <= 1 || ostep % olsize == 0);
+
+ if ((istep % ilsize == 0) &&
+ !is_mem_overlap(ip, istep, op, ostep, len) &&
+ npyv_loadable_stride_@sfx@(istride) &&
+ npyv_storable_stride_@sfx@(ostride))
+ {
+ if (istride == 1 && ostride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+ }
+ else if (ostride == 1) {
+ simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+ }
+ else if (istride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+ } else {
+ simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+ }
+ } else
+#endif // @VCHK@
+ {
+ UNARY_LOOP {
+ const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+ *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+ }
+ }
+
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+/**end repeat**/
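
Two details of the new file are worth spelling out. With 128-bit vectors (NPY_SIMD_FORCE_128), PACK_FACTOR = NPY_SIMD_WIDTH / npyv_nlanes works out to 16/4 = 4 for f32 and 16/2 = 8 for f64, so each npyv_store_u8 writes 16 npy_bool outputs at a time. And on aarch64, the prepacked isfinite kernels classify values by their exponent bits alone: the sign bit is shifted out, the exponent byte is gathered with vqtbl4q_u8, and a value is finite iff that byte is not all ones. A scalar model of that test is sketched below; isfinite_by_exponent_f32 is an illustrative name, not part of the source above.

    /* Illustrative scalar model of the npyv_isfinite_4x_f32 exponent test. */
    #include <stdint.h>
    #include <string.h>

    static unsigned char isfinite_by_exponent_f32(float v)
    {
        uint32_t bits;
        memcpy(&bits, &v, sizeof bits);
        uint8_t exponent = (uint8_t)((bits << 1) >> 24);  /* sign shifted out, exponent in top byte */
        return exponent != 0xFFu;                         /* all-ones exponent => inf or NaN        */
    }
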
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 35bf90694..2c16243db 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -425,39 +425,6 @@ class _SIMD_FP(_Test_Utility):
square = self.square(vdata)
assert square == data_square
- def test_isfinite_isinf_isnan_signbit(self):
- pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
- assert np.isnan(nan) == True
- assert np.isfinite(ninf) == False
- assert np.isfinite(pinf) == False
- assert np.isinf(ninf) == True
- assert np.isinf(pinf) == True
- assert np.signbit(pinf) == False
- assert np.signbit(ninf) == True
-
- def test_array_isfinite_isinf_isnan_signbit(self):
- size = 12
- for t in [np.float32, np.float64]:
- arr = np.random.default_rng().random(size, t)
- assert np.isnan(arr)[2] == False
- assert np.isfinite(arr)[4] == True
- assert np.isinf(arr)[6] == False
- assert np.signbit(arr)[8] == False
-
- for t in [np.uint8, np.uint16, np.uint32, np.uint64]:
- arr = np.random.default_rng().integers(0,100,size,dtype=t)
- assert np.isnan(arr)[2] == False
- assert np.isfinite(arr)[4] == True
- assert np.isinf(arr)[6] == False
- assert np.signbit(arr)[8] == False
-
- for t in [np.int8, np.int16, np.int32, np.int64]:
- arr = np.random.default_rng().integers(-100,0,size,dtype=t)
- assert np.isnan(arr)[2] == False
- assert np.isfinite(arr)[4] == True
- assert np.isinf(arr)[6] == False
- assert np.signbit(arr)[8] == True
-
@pytest.mark.parametrize("intrin, func", [("ceil", math.ceil),
("trunc", math.trunc), ("floor", math.floor), ("rint", round)])
def test_rounding(self, intrin, func):