| field | value | date |
|---|---|---|
| author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2022-10-05 20:26:04 -0700 |
| committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2023-01-04 02:19:18 -0800 |
| commit | 7e4746782454d2f2bbcd87bf6d8c0b89d000cdd5 (patch) | |
| tree | 395c9d55f275e8932b96c323f99b589e077a146a /numpy | |
| parent | baa003e0c5a8616e26b86071283deb86c79ac5f0 (diff) | |
| download | numpy-7e4746782454d2f2bbcd87bf6d8c0b89d000cdd5.tar.gz | |
re-enable vx and vxe (avoid perf regressions) for existing code by splitting this new code into a new file
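The loops being split out implement `isnan`, `isinf`, `isfinite`, and `signbit` with plain bit-level comparisons rather than libm calls; the `npyv_*` helpers in the diff below apply the same tests lane-wise. As a reading aid, here is a minimal scalar sketch of those four predicates for single precision (the `my_*` names are illustrative, not NumPy identifiers):

```c
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar versions of the predicates that the SIMD kernels vectorize.
 * The vector code masks off the sign bit instead of calling fabsf() so it
 * never touches libm, but the comparisons are the same. */
static int my_isnan_f32(float v)    { return v != v; }              /* NaN never compares equal to itself */
static int my_isinf_f32(float v)    { return fabsf(v) > FLT_MAX; }  /* only +/-inf exceeds FLT_MAX */
static int my_isfinite_f32(float v) { return fabsf(v) <= FLT_MAX; } /* NaN fails this compare as well */

static int my_signbit_f32(float v)
{
    uint32_t bits;
    memcpy(&bits, &v, sizeof bits);  /* reinterpret the float's bit pattern */
    return (int)(bits >> 31);        /* the sign occupies the top bit */
}
```

Each vector helper produces a full-width lane mask from one of these comparisons and then shifts it down to 0/1; that shift is the step the new file's header comment flags as misbehaving on the big-endian VX/VXE targets, which is what motivated keeping this code in a separate dispatch file.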
Diffstat (limited to 'numpy')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 8 |
| -rw-r--r-- | numpy/core/setup.py | 1 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 17 |
| -rw-r--r-- | numpy/core/src/umath/loops_unary_fp.dispatch.c.src | 425 |
| -rw-r--r-- | numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src | 556 |
| -rw-r--r-- | numpy/core/tests/test_simd.py | 33 |
6 files changed, 577 insertions, 463 deletions
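Beyond the lane-wise predicates, the aarch64 path in the new `loops_unary_fp_le` file packs four float32 (or eight float64) results into one byte vector with `vqtbl4q_u8`, keying off the exponent field: after the sign bit is shifted out, a value is finite exactly when its exponent byte is not all ones. A scalar sketch of that classification trick, assuming IEEE 754 single precision (the helper name is illustrative):

```c
#include <stdint.h>
#include <string.h>

/* A float is finite iff its 8-bit exponent field is not all ones;
 * an all-ones exponent encodes inf or NaN. */
static int exponent_says_finite_f32(float v)
{
    uint32_t bits;
    memcpy(&bits, &v, sizeof bits);               /* grab the raw bit pattern */
    uint32_t no_sign = bits << 1;                 /* shift the sign bit out */
    uint8_t exponent = (uint8_t)(no_sign >> 24);  /* exponent now fills the top byte */
    return exponent != 0xFF;
}
```

In the NEON helpers below, the permute `{3,7,11,15, ...}` gathers exactly those high bytes from four shifted vectors, so a single byte-wise compare against `0xff` classifies sixteen floats at once.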
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index f01ac4031..e7e63f06f 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -921,7 +921,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.isnan'), 'PyUFunc_IsFiniteTypeResolver', - TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]), + TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]), ), 'isnat': Ufunc(1, 1, None, @@ -933,19 +933,19 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.isinf'), 'PyUFunc_IsFiniteTypeResolver', - TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]), + TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]), ), 'isfinite': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.isfinite'), 'PyUFunc_IsFiniteTypeResolver', - TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]), + TD(noobj, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]), ), 'signbit': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.signbit'), None, - TD(flts, out='?', dispatch=[('loops_unary_fp', inexactvec)]), + TD(flts, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]), ), 'copysign': Ufunc(2, 1, None, diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a5bd4a056..691c53f16 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1007,6 +1007,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary.dispatch.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_unary_fp_le.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_logical.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index c13a6491f..26742946f 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -241,8 +241,21 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 - * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal, - * isnan, isinf, isfinite, signbit# + * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp_le.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 54459d545..c4e7b8929 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -3,13 +3,8 @@ ** sse2 sse41 ** vsx2 ** neon asimd + ** vx vxe **/ - -/** - * We ran into lots of test failures trying to enable this file for - * VSE and VE on s390x (qemu) so avoiding these targets for now. 
-*/ - /** * Force use SSE only on x86, even if AVX2 or AVX512F are enabled * through the baseline, since scatter(AVX512F) and gather very costly @@ -17,17 +12,10 @@ * such small operations that this file covers. */ #define NPY_SIMD_FORCE_128 -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include <float.h> #include "numpy/npy_math.h" #include "simd/simd.h" #include "loops_utils.h" #include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" /********************************************************** ** Scalars **********************************************************/ @@ -92,198 +80,6 @@ NPY_FINLINE double c_square_f64(double a) #define c_rint_f32 npy_rintf #define c_rint_f64 npy_rint -/******************************************************************************* - ** extra SIMD intrinsics - ******************************************************************************/ - -#if NPY_SIMD -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - * #FDMAX = FLT_MAX, DBL_MAX# - * #fd = f, # - * #ssfx = 32, 64# - */ -#if @VCHK@ - -static NPY_INLINE npyv_u@ssfx@ -npyv_isnan_@sfx@(npyv_@sfx@ v) -{ - // (v != v) >> (size - 1) -#if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // sse npyv_cmpneq_@sfx@ define includes a cast already - npyv_u@ssfx@ r = npyv_cmpneq_@sfx@(v, v); -#else - npyv_u@ssfx@ r = npyv_cvt_u@ssfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v)); -#endif - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); -} - -static NPY_INLINE npyv_u@ssfx@ -npyv_isinf_@sfx@(npyv_@sfx@ v) -{ - // (abs(v) > fltmax) >> (size - 1) - const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); -#if defined(NPY_HAVE_NEON) - npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax); -#else - // fabs via masking of sign bit - const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); - npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); - #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // return cast already done in npyv_cmpgt_@sfx@ - npyv_u@ssfx@ r = npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); - #else - npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); - #endif -#endif - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); -} - -static NPY_INLINE npyv_u@ssfx@ -npyv_isfinite_@sfx@(npyv_@sfx@ v) -{ - // ((v & signmask) <= fltmax) >> (size-1) - const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); -#if defined(NPY_HAVE_NEON) - npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax); -#else - // fabs via masking of sign bit - const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); - npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); - #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) - // return cast already done in npyv_cmpgt_@sfx@ - npyv_u@ssfx@ r = npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); - #else - npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); - #endif -#endif - return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); -} - -static NPY_INLINE npyv_u@ssfx@ -npyv_signbit_@sfx@(npyv_@sfx@ v) -{ - return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1); -} - -#endif // @VCHK@ -/**end repeat**/ - -// In these functions we use vqtbl4q_u8 which is only available on aarch64 
-#if defined(NPY_HAVE_NEON) && defined(__aarch64__) - #define PREPACK_ISFINITE 1 - #define PREPACK_SIGNBIT 1 - - #if NPY_SIMD_F32 - - static NPY_INLINE npyv_u8 - npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) - { - // F32 exponent is 8-bits, which means we can pack multiple into - // a single vector. We shift out sign bit so that we're left - // with only exponent in high byte. If not all bits are set, - // then we've got a finite number. - uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); - tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); - tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); - tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - - const npyv_u8 expmask = npyv_setall_u8(0xff); - r = npyv_cmpneq_u8(r, expmask); - r = vshrq_n_u8(r, 7); - return r; - } - - static NPY_INLINE npyv_u8 - npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) - { - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. - uint8x16x4_t tbl; - tbl.val[0] = npyv_reinterpret_u8_f32(v0); - tbl.val[1] = npyv_reinterpret_u8_f32(v1); - tbl.val[2] = npyv_reinterpret_u8_f32(v2); - tbl.val[3] = npyv_reinterpret_u8_f32(v3); - - const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; - npyv_u8 r = vqtbl4q_u8(tbl, permute); - r = vshrq_n_u8(r, 7); - return r; - } - - #endif // NPY_SIMD_F32 - - #if NPY_SIMD_F64 - - static NPY_INLINE npyv_u8 - npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) - { - // F64 exponent is 11-bits, which means we can pack multiple into - // a single vector. We'll need to use u16 to fit all exponent - // bits. If not all bits are set, then we've got a finite number. - uint8x16x4_t t0123, t4567; - t0123.val[0] = npyv_reinterpret_u8_f64(v0); - t0123.val[1] = npyv_reinterpret_u8_f64(v1); - t0123.val[2] = npyv_reinterpret_u8_f64(v2); - t0123.val[3] = npyv_reinterpret_u8_f64(v3); - t4567.val[0] = npyv_reinterpret_u8_f64(v4); - t4567.val[1] = npyv_reinterpret_u8_f64(v5); - t4567.val[2] = npyv_reinterpret_u8_f64(v6); - t4567.val[3] = npyv_reinterpret_u8_f64(v7); - - const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; - npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); - npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); - - const npyv_u16 expmask = npyv_setall_u16(0x7ff0); - r0 = npyv_and_u16(r0, expmask); - r0 = npyv_cmpneq_u16(r0, expmask); - r0 = npyv_shri_u16(r0, 15); - r1 = npyv_and_u16(r1, expmask); - r1 = npyv_cmpneq_u16(r1, expmask); - r1 = npyv_shri_u16(r1, 15); - - npyv_u8 r = npyv_pack_b8_b16(r0, r1); - return r; - } - - static NPY_INLINE npyv_u8 - npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, - npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) - { - // We only need high byte for signbit, which means we can pack - // multiple inputs into a single vector. 
- - // vuzp2 faster than vtbl for f64 - npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); - npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); - npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); - npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); - - npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); - npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); - - npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); - r = vshrq_n_u8(r, 7); - return r; - } - - #endif // NPY_SIMD_F64 - -#else - #define PREPACK_ISFINITE 0 - #define PREPACK_SIGNBIT 0 -#endif // defined(NPY_HAVE_NEON) && defined(__aarch64__) - -#endif // NPY_SIMD - /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ @@ -353,9 +149,6 @@ npyv_signbit_@sfx@(npyv_@sfx@ v) * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# - * #FDMAX = FLT_MAX, DBL_MAX# - * #fd = f, # - * #ssfx = 32, 64# */ #if @VCHK@ /**begin repeat1 @@ -456,179 +249,10 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ } /**end repeat2**/ /**end repeat1**/ - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT# - */ -/**begin repeat2 - * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# - * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# - * #unroll = 1, 1, 1, 1# - */ -static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ -(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) -{ - const npyv_lanetype_@sfx@ *ip = src; - npy_bool *op = dst; - - // How many vectors can be packed into a u8 / bool vector? 
- #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@) - assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep * @unroll@ * PACK_FACTOR; - - // unrolled iterations - for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - // Load vectors - #if @STYPE@ == CONTIG - // contiguous input - npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n) - npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@)); - npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@)); - npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@)); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@)); - npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@)); - npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@)); - npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78 - #endif - #else - // non-contiguous input - npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride); - #if PACK_FACTOR == 8 - npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride); - npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride); - #endif - #endif - #endif // @unroll@ > @N@ - /**end repeat3**/ - - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - #if @ssfx@ == 32 - npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@); - #elif @ssfx@ == 64 - npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@, - v4_@N@, v5_@N@, v6_@N@, v7_@N@); - #endif - #else - npyv_b@ssfx@ r0_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v0_@N@)); - npyv_b@ssfx@ r1_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v1_@N@)); - npyv_b@ssfx@ r2_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v2_@N@)); - npyv_b@ssfx@ r3_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v3_@N@)); - #if PACK_FACTOR == 8 - npyv_b@ssfx@ r4_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v4_@N@)); - npyv_b@ssfx@ r5_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v5_@N@)); - npyv_b@ssfx@ r6_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v6_@N@)); - npyv_b@ssfx@ r7_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v7_@N@)); - #endif // PACK_FACTOR == 8 - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - #endif // @unroll@ > @N@ - /**end repeat3**/ - - /**begin repeat3 - * #N = 0, 1, 2, 3# - */ - #if @unroll@ > @N@ - #if @DTYPE@ == CONTIG - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - // Nothing to do, results already packed - #else - // Pack results - #if PACK_FACTOR == 4 - npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@)); - #elif PACK_FACTOR == 8 - npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, 
r3_@N@, - r4_@N@, r5_@N@, r6_@N@, r7_@N@)); - #endif // PACK_FACTOR == 8 - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - - npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@); - - #else // @DTYPE@ == CONTIG - #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - // Results are packed, so we can just loop over them - npy_uint8 lane_@N@[npyv_nlanes_u8]; - npyv_store_u8(lane_@N@, r_@N@); - for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ - op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)]; - } - #else - // Results are not packed. Use template to loop. - /**begin repeat4 - * #R = 0, 1, 2, 3, 4, 5, 6, 7# - */ - #if @R@ < PACK_FACTOR - npy_uint8 lane@R@_@N@[npyv_nlanes_u8]; - npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@)); - op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)]; - op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 - op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)]; - op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)]; - #endif - #endif - /**end repeat4**/ - #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) - #endif // @DTYPE@ == CONTIG - #endif // @unroll@ > @N@ - /**end repeat3**/ - } - - // vector-sized iterations - for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { - #if @STYPE@ == CONTIG - npyv_@sfx@ v = npyv_load_@sfx@(ip); - #else - npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); - #endif - - npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v); - - npy_uint8 lane[npyv_nlanes_u8]; - npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); - - op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; - op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 - op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; - op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; - #endif - } - - #undef PACK_FACTOR - - // Scalar loop to finish off - for (; len > 0; --len, ip += istride, op += ostride) { - *op = (npy_@kind@(*ip) != 0); - } - - - npyv_cleanup(); -} -/**end repeat2**/ -/**end repeat1**/ - #endif // @VCHK@ /**end repeat**/ #undef WORKAROUND_CLANG_RECIPROCAL_BUG -#undef PREPACK_ISFINITE -#undef PREPACK_SIGNBIT /******************************************************************************** ** Defining ufunc inner functions @@ -692,51 +316,4 @@ clear:; #endif } /**end repeat1**/ - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - **/ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if @VCHK@ - const char *ip = args[0]; - char *op = args[1]; - const npy_intp istep = steps[0]; - const npy_intp ostep = steps[1]; - npy_intp len = dimensions[0]; - const int ilsize = sizeof(npyv_lanetype_@sfx@); - const int olsize = sizeof(npy_bool); - const npy_intp istride = istep / ilsize; - const npy_intp ostride = ostep / olsize; - assert(len <= 1 || ostep % olsize == 0); - - if ((istep % ilsize == 0) && - !is_mem_overlap(ip, istep, op, ostep, len) && - npyv_loadable_stride_@sfx@(istride) && - npyv_storable_stride_@sfx@(ostride)) - { - if (istride == 1 && ostride == 1) { - simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len); - } - else if (ostride == 1) { - simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, 
len); - } - else if (istride == 1) { - simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len); - } else { - simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len); - } - } else -#endif // @VCHK@ - { - UNARY_LOOP { - const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1; - *((npy_bool *)op1) = (npy_@kind@(in) != 0); - } - } - - npy_clear_floatstatus_barrier((char*)dimensions); -} -/**end repeat1**/ /**end repeat**/ diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src new file mode 100644 index 000000000..fed8213df --- /dev/null +++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -0,0 +1,556 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 + ** vsx2 + ** neon asimd + **/ + +/** + * This code should really be merged into loops_unary_fp.dispatch.c.src + * However there is an issue with enabling the code here for VX and VXE + * as the shifts don't behave as expected. + * See the code below that references NPY__CPU_TARGET_VX and + * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue. + * + * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src + * building for VX and VXE so we don't regress performance while adding this + * code for other platforms. +*/ + +/** + * Force use SSE only on x86, even if AVX2 or AVX512F are enabled + * through the baseline, since scatter(AVX512F) and gather very costly + * to handle non-contiguous memory access comparing with SSE for + * such small operations that this file covers. +*/ +#define NPY_SIMD_FORCE_128 +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include <float.h> +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #FDMAX = FLT_MAX, DBL_MAX# + * #fd = f, # + * #ssfx = 32, 64# + */ +#if @VCHK@ + +static NPY_INLINE npyv_u@ssfx@ +npyv_isnan_@sfx@(npyv_@sfx@ v) +{ + // (v != v) >> (size - 1) +#if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) + // sse npyv_cmpneq_@sfx@ define includes a cast already + npyv_u@ssfx@ r = npyv_cmpneq_@sfx@(v, v); +#else + npyv_u@ssfx@ r = npyv_cvt_u@ssfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v)); +#endif +#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) + // don't do the shift on s390x + return r; +#else + return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); +#endif +} + +static NPY_INLINE npyv_u@ssfx@ +npyv_isinf_@sfx@(npyv_@sfx@ v) +{ + // (abs(v) > fltmax) >> (size - 1) + const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); +#if defined(NPY_HAVE_NEON) + npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax); +#else + // fabs via masking of sign bit + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); + #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) + // return cast already done in npyv_cmpgt_@sfx@ + npyv_u@ssfx@ r = npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); + #else + npyv_u@ssfx@ r = 
npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmpgt_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); + #endif +#endif +#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) + // don't do the shift on s390x + return r; +#else + return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); +#endif +} + +static NPY_INLINE npyv_u@ssfx@ +npyv_isfinite_@sfx@(npyv_@sfx@ v) +{ + // ((v & signmask) <= fltmax) >> (size-1) + const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@); +#if defined(NPY_HAVE_NEON) + npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax); +#else + // fabs via masking of sign bit + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + npyv_u8 r_u8 = npyv_andc_u8(npyv_reinterpret_u8_@sfx@(v), npyv_reinterpret_u8_@sfx@(signmask)); + #if defined(NPY_HAVE_SSE2) || defined (NPY_HAVE_SSE41) + // return cast already done in npyv_cmpgt_@sfx@ + npyv_u@ssfx@ r = npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax); + #else + npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_cmple_@sfx@(npyv_reinterpret_@sfx@_u8(r_u8), fltmax)); + #endif +#endif +#if defined(NPY__CPU_TARGET_VX) || defined(NPY__CPU_TARGET_VXE) + // don't do the shift on s390x + return r; +#else + return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1); +#endif +} + +static NPY_INLINE npyv_u@ssfx@ +npyv_signbit_@sfx@(npyv_@sfx@ v) +{ +#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN + // via masking of sign bit + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + npyv_u@ssfx@ r = npyv_reinterpret_u@ssfx@_@sfx@(npyv_and_@sfx@(v, signmask)); + return r; +#else + return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1); +#endif +} + +#endif // @VCHK@ +/**end repeat**/ + +// In these functions we use vqtbl4q_u8 which is only available on aarch64 +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + #define PREPACK_ISFINITE 1 + #define PREPACK_SIGNBIT 1 + + #if NPY_SIMD_F32 + + static NPY_INLINE npyv_u8 + npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) + { + // F32 exponent is 8-bits, which means we can pack multiple into + // a single vector. We shift out sign bit so that we're left + // with only exponent in high byte. If not all bits are set, + // then we've got a finite number. + uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); + tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); + tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); + tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + + const npyv_u8 expmask = npyv_setall_u8(0xff); + r = npyv_cmpneq_u8(r, expmask); + r = vshrq_n_u8(r, 7); + return r; + } + + static NPY_INLINE npyv_u8 + npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) + { + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. 
+ uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_f32(v0); + tbl.val[1] = npyv_reinterpret_u8_f32(v1); + tbl.val[2] = npyv_reinterpret_u8_f32(v2); + tbl.val[3] = npyv_reinterpret_u8_f32(v3); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + r = vshrq_n_u8(r, 7); + return r; + } + + #endif // NPY_SIMD_F32 + + #if NPY_SIMD_F64 + + static NPY_INLINE npyv_u8 + npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) + { + // F64 exponent is 11-bits, which means we can pack multiple into + // a single vector. We'll need to use u16 to fit all exponent + // bits. If not all bits are set, then we've got a finite number. + uint8x16x4_t t0123, t4567; + t0123.val[0] = npyv_reinterpret_u8_f64(v0); + t0123.val[1] = npyv_reinterpret_u8_f64(v1); + t0123.val[2] = npyv_reinterpret_u8_f64(v2); + t0123.val[3] = npyv_reinterpret_u8_f64(v3); + t4567.val[0] = npyv_reinterpret_u8_f64(v4); + t4567.val[1] = npyv_reinterpret_u8_f64(v5); + t4567.val[2] = npyv_reinterpret_u8_f64(v6); + t4567.val[3] = npyv_reinterpret_u8_f64(v7); + + const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; + npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); + npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); + + const npyv_u16 expmask = npyv_setall_u16(0x7ff0); + r0 = npyv_and_u16(r0, expmask); + r0 = npyv_cmpneq_u16(r0, expmask); + r0 = npyv_shri_u16(r0, 15); + r1 = npyv_and_u16(r1, expmask); + r1 = npyv_cmpneq_u16(r1, expmask); + r1 = npyv_shri_u16(r1, 15); + + npyv_u8 r = npyv_pack_b8_b16(r0, r1); + return r; + } + + static NPY_INLINE npyv_u8 + npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) + { + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. + + // vuzp2 faster than vtbl for f64 + npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); + npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); + npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); + npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); + + npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); + npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); + + npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); + r = vshrq_n_u8(r, 7); + return r; + } + + #endif // NPY_SIMD_F64 + +#else + #define PREPACK_ISFINITE 0 + #define PREPACK_SIGNBIT 0 +#endif // defined(NPY_HAVE_NEON) && defined(__aarch64__) + +#endif // NPY_SIMD + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/** Notes: + * - avoid the use of libmath to unify fp/domain errors + * for both scalars and vectors among all compilers/architectures. + * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_ + * to fill the remind lanes with 1.0 to avoid divide by zero fp + * exception in reciprocal. + */ +#define CONTIG 0 +#define NCONTIG 1 + +/* + * clang has a bug that's present at -O1 or greater. 
When partially loading a + * vector register for a reciprocal operation, the remaining elements are set + * to 1 to avoid divide-by-zero. The partial load is paired with a partial + * store after the reciprocal operation. clang notices that the entire register + * is not needed for the store and optimizes out the fill of 1 to the remaining + * elements. This causes either a divide-by-zero or 0/0 with invalid exception + * that we were trying to avoid by filling. + * + * Using a dummy variable marked 'volatile' convinces clang not to ignore + * the explicit fill of remaining elements. If `-ftrapping-math` is + * supported, then it'll also avoid the bug. `-ftrapping-math` is supported + * on Apple clang v12+ for x86_64. It is not currently supported for arm64. + * `-ftrapping-math` is set by default of Numpy builds in + * numpy/distutils/ccompiler.py. + * + * Note: Apple clang and clang upstream have different versions that overlap + */ +#if defined(__clang__) + #if defined(__apple_build_version__) + // Apple Clang + #if __apple_build_version__ < 12000000 + // Apple Clang before v12 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Apple Clang after v12, targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 + #else + // Apple Clang after v12, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #endif + #else + // Clang, not Apple Clang + #if __clang_major__ < 10 + // Clang before v10 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(_MSC_VER) + // clang-cl has the same bug + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Clang v10+, targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 + #else + // Clang v10+, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #endif + #endif +#else +// Not a Clang compiler +#define WORKAROUND_CLANG_RECIPROCAL_BUG 0 +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #FDMAX = FLT_MAX, DBL_MAX# + * #fd = f, # + * #ssfx = 32, 64# + */ +#if @VCHK@ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT# + */ +/**begin repeat2 + * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# + * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# + * #unroll = 1, 1, 1, 1# + */ +static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ +(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) +{ + const npyv_lanetype_@sfx@ *ip = src; + npy_bool *op = dst; + + // How many vectors can be packed into a u8 / bool vector? 
+ #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@) + assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * @unroll@ * PACK_FACTOR; + + // unrolled iterations + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + /**begin repeat3 + * #N = 0, 1, 2, 3# + */ + #if @unroll@ > @N@ + // Load vectors + #if @STYPE@ == CONTIG + // contiguous input + npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n) + npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@)); + npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@)); + npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@)); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@)); + npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@)); + npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@)); + npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78 + #endif + #else + // non-contiguous input + npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride); + npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride); + #endif + #endif + #endif // @unroll@ > @N@ + /**end repeat3**/ + + /**begin repeat3 + * #N = 0, 1, 2, 3# + */ + #if @unroll@ > @N@ + #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + #if @ssfx@ == 32 + npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@); + #elif @ssfx@ == 64 + npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@, + v4_@N@, v5_@N@, v6_@N@, v7_@N@); + #endif + #else + npyv_b@ssfx@ r0_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v0_@N@)); + npyv_b@ssfx@ r1_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v1_@N@)); + npyv_b@ssfx@ r2_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v2_@N@)); + npyv_b@ssfx@ r3_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v3_@N@)); + #if PACK_FACTOR == 8 + npyv_b@ssfx@ r4_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v4_@N@)); + npyv_b@ssfx@ r5_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v5_@N@)); + npyv_b@ssfx@ r6_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v6_@N@)); + npyv_b@ssfx@ r7_@N@ = npyv_cvt_b@ssfx@_u@ssfx@(npyv_@kind@_@sfx@(v7_@N@)); + #endif // PACK_FACTOR == 8 + #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + #endif // @unroll@ > @N@ + /**end repeat3**/ + + /**begin repeat3 + * #N = 0, 1, 2, 3# + */ + #if @unroll@ > @N@ + #if @DTYPE@ == CONTIG + #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + // Nothing to do, results already packed + #else + // Pack results + #if PACK_FACTOR == 4 + npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@)); + #elif PACK_FACTOR == 8 + npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, 
r3_@N@, + r4_@N@, r5_@N@, r6_@N@, r7_@N@)); + #endif // PACK_FACTOR == 8 + #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + + npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@); + + #else // @DTYPE@ == CONTIG + #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + // Results are packed, so we can just loop over them + npy_uint8 lane_@N@[npyv_nlanes_u8]; + npyv_store_u8(lane_@N@, r_@N@); + for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ + op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)]; + } + #else + // Results are not packed. Use template to loop. + /**begin repeat4 + * #R = 0, 1, 2, 3, 4, 5, 6, 7# + */ + #if @R@ < PACK_FACTOR + npy_uint8 lane@R@_@N@[npyv_nlanes_u8]; + npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@)); + op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)]; + op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)]; + #if npyv_nlanes_@sfx@ == 4 + op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)]; + op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)]; + #endif + #endif + /**end repeat4**/ + #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64) + #endif // @DTYPE@ == CONTIG + #endif // @unroll@ > @N@ + /**end repeat3**/ + } + + // vector-sized iterations + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + #if @STYPE@ == CONTIG + npyv_@sfx@ v = npyv_load_@sfx@(ip); + #else + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + #endif + + npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v); + + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); + + op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; + op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; + #if npyv_nlanes_@sfx@ == 4 + op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; + op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; + #endif + } + + #undef PACK_FACTOR + + // Scalar loop to finish off + for (; len > 0; --len, ip += istride, op += ostride) { + *op = (npy_@kind@(*ip) != 0); + } + + + npyv_cleanup(); +} +/**end repeat2**/ +/**end repeat1**/ + +#endif // @VCHK@ +/**end repeat**/ + +#undef WORKAROUND_CLANG_RECIPROCAL_BUG +#undef PREPACK_ISFINITE +#undef PREPACK_SIGNBIT + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + */ + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if @VCHK@ + const char *ip = args[0]; + char *op = args[1]; + const npy_intp istep = steps[0]; + const npy_intp ostep = steps[1]; + npy_intp len = dimensions[0]; + const int ilsize = sizeof(npyv_lanetype_@sfx@); + const int olsize = sizeof(npy_bool); + const npy_intp istride = istep / ilsize; + const npy_intp ostride = ostep / olsize; + assert(len <= 1 || ostep % olsize == 0); + + if ((istep % ilsize == 0) && + !is_mem_overlap(ip, istep, op, ostep, len) && + npyv_loadable_stride_@sfx@(istride) && + npyv_storable_stride_@sfx@(ostride)) + { + if (istride == 1 && ostride == 1) { + 
simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len); + } + else if (ostride == 1) { + simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len); + } + else if (istride == 1) { + simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len); + } else { + simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len); + } + } else +#endif // @VCHK@ + { + UNARY_LOOP { + const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1; + *((npy_bool *)op1) = (npy_@kind@(in) != 0); + } + } + + npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 35bf90694..2c16243db 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -425,39 +425,6 @@ class _SIMD_FP(_Test_Utility): square = self.square(vdata) assert square == data_square - def test_isfinite_isinf_isnan_signbit(self): - pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() - assert np.isnan(nan) == True - assert np.isfinite(ninf) == False - assert np.isfinite(pinf) == False - assert np.isinf(ninf) == True - assert np.isinf(pinf) == True - assert np.signbit(pinf) == False - assert np.signbit(ninf) == True - - def test_array_isfinite_isinf_isnan_signbit(self): - size = 12 - for t in [np.float32, np.float64]: - arr = np.random.default_rng().random(size, t) - assert np.isnan(arr)[2] == False - assert np.isfinite(arr)[4] == True - assert np.isinf(arr)[6] == False - assert np.signbit(arr)[8] == False - - for t in [np.uint8, np.uint16, np.uint32, np.uint64]: - arr = np.random.default_rng().integers(0,100,size,dtype=t) - assert np.isnan(arr)[2] == False - assert np.isfinite(arr)[4] == True - assert np.isinf(arr)[6] == False - assert np.signbit(arr)[8] == False - - for t in [np.int8, np.int16, np.int32, np.int64]: - arr = np.random.default_rng().integers(-100,0,size,dtype=t) - assert np.isnan(arr)[2] == False - assert np.isfinite(arr)[4] == True - assert np.isinf(arr)[6] == False - assert np.signbit(arr)[8] == True - @pytest.mark.parametrize("intrin, func", [("ceil", math.ceil), ("trunc", math.trunc), ("floor", math.floor), ("rint", round)]) def test_rounding(self, intrin, func): |
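The ufunc inner functions at the end of the new file convert the byte steps supplied by the ufunc machinery into element strides, check for overlap and loadable/storable strides, and then pick one of four contiguity-specialized kernels, with a scalar loop as the fallback. A hedged scalar sketch of that overall shape (single precision only; the name and the simplifications are mine, not NumPy's):

```c
#include <math.h>
#include <stddef.h>

/* Hypothetical stand-in for the shape of a dispatched inner loop: byte steps
 * become element strides, the strided path runs only when the input step is
 * element-aligned, and a byte-stepping scalar loop (UNARY_LOOP in the real
 * code) covers everything else. */
static void isnan_float_inner_loop(char **args, const ptrdiff_t *dimensions,
                                   const ptrdiff_t *steps)
{
    const char *ip = args[0];
    char *op = args[1];
    const ptrdiff_t istep = steps[0], ostep = steps[1];
    const ptrdiff_t len = dimensions[0];

    if (istep % (ptrdiff_t)sizeof(float) == 0) {
        const ptrdiff_t istride = istep / (ptrdiff_t)sizeof(float);
        const ptrdiff_t ostride = ostep;  /* the bool output is one byte wide */
        const float *in = (const float *)ip;
        unsigned char *out = (unsigned char *)op;
        /* The real code also rejects overlapping buffers and selects one of
         * four SIMD kernels depending on whether each stride equals 1. */
        for (ptrdiff_t i = 0; i < len; ++i) {
            out[i * ostride] = (unsigned char)(isnan(in[i * istride]) != 0);
        }
        return;
    }
    /* Fallback: step byte-wise, mirroring the scalar UNARY_LOOP branch. */
    for (ptrdiff_t i = 0; i < len; ++i, ip += istep, op += ostep) {
        *(unsigned char *)op = (unsigned char)(isnan(*(const float *)ip) != 0);
    }
}
```

The real functions additionally call `npy_clear_floatstatus_barrier` on exit, presumably so the classification ufuncs do not leak floating-point status flags raised while computing the result.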
