| author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2022-08-23 12:35:38 -0700 |
|---|---|---|
| committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2023-01-04 02:19:17 -0800 |
| commit | 3725e9f3237362037095f4100979a11864cfcc04 (patch) | |
| tree | addcfc639979feffd9dc8efd54db3b6774beb601 | |
| parent | f73d53d1a44ac841458c5be852e05021feec5dff (diff) | |
| download | numpy-3725e9f3237362037095f4100979a11864cfcc04.tar.gz | |
ENH: Implement SIMD versions of isnan, isinf, isfinite and signbit
NumPy has SIMD versions of the float / double `isnan`, `isinf`, `isfinite`, and `signbit` ufuncs for SSE2 and AVX-512. The changes here replace the SSE2 versions with implementations built on NumPy's universal intrinsics, so that other architectures gain SIMD versions of these functions as well.
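All four ufuncs share the same inner-loop contract: read float/double elements, write one boolean byte (0 or 1) per element. A minimal scalar sketch of that contract in plain C (the helper name is hypothetical, not NumPy's actual loop):

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference for the vectorized kernels: one 0/1 byte out per
 * float in. `ref_isnan_f32` is an illustrative name, not a NumPy API. */
static void ref_isnan_f32(const float *ip, uint8_t *op, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        op[i] = isnan(ip[i]) != 0;   /* ufunc contract: output is 0 or 1 */
    }
}

int main(void)
{
    const float in[4] = {1.0f, NAN, INFINITY, -0.0f};
    uint8_t out[4];
    ref_isnan_f32(in, out, 4);
    for (int i = 0; i < 4; i++) {
        printf("%d ", (int)out[i]);  /* expected: 0 1 0 0 */
    }
    printf("\n");
    return 0;
}
```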
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 8 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 24 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 12 |
| -rw-r--r-- | numpy/core/src/umath/loops_unary_fp.dispatch.c.src | 398 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 262 |
5 files changed, 427 insertions, 277 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1a05ee4..f01ac4031 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -921,7 +921,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isnan'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'isnat':
     Ufunc(1, 1, None,
@@ -933,19 +933,19 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isinf'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'isfinite':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isfinite'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'signbit':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.signbit'),
           None,
-          TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(flts, out='?', dispatch=[('loops_unary_fp', inexactvec)]),
           ),
 'copysign':
     Ufunc(2, 1, None,
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 7b070a084..2b70a4b9a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1306,6 +1306,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, , l#
  * #C = F, , L#
+ * #fd = 1, 1, 0#
+ * #VCHK = 1, 1, 0#
  */
 /**begin repeat1
  * #kind = logical_and, logical_or#
@@ -1342,32 +1344,22 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@fd@
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
  **/
-
-/**begin repeat2
- * #ISA = , _avx512_skx#
- * #isa = simd, avx512_skx#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((npy_bool *)op1) = @func@(in1) != 0;
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *((npy_bool *)op1) = @func@(in1) != 0;
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
-#endif
-/**end repeat2**/
 /**end repeat1**/
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
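For LONGDOUBLE, where `@fd@` is 0, only the scalar path above is kept. Below is a rough, standalone C rendering of what that fallback does, with `UNARY_LOOP` expanded by hand and plain C types standing in for the `npy_*` typedefs (the function name is illustrative, not the generated symbol):

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hand expansion of the scalar fallback above for long double isnan. */
static void
longdouble_isnan_ref(char **args, const ptrdiff_t *dimensions, const ptrdiff_t *steps)
{
    char *ip1 = args[0], *op1 = args[1];
    for (ptrdiff_t i = 0; i < dimensions[0]; i++, ip1 += steps[0], op1 += steps[1]) {
        const long double in1 = *(long double *)ip1;
        *(uint8_t *)op1 = isnan(in1) != 0;
    }
}

int main(void)
{
    long double in[3] = {1.0L, NAN, -2.5L};
    uint8_t out[3];
    char *args[2] = {(char *)in, (char *)out};
    ptrdiff_t dims[1] = {3};
    ptrdiff_t steps[2] = {sizeof(long double), sizeof(uint8_t)};
    longdouble_isnan_ref(args, dims, steps);
    printf("%d %d %d\n", (int)out[0], (int)out[1], (int)out[2]);  /* 0 1 0 */
    return 0;
}
```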
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 411d53e94..c13a6491f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -241,7 +241,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
  * #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal,
+ *         isnan, isinf, isfinite, signbit#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -400,6 +401,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
  * #c = f, f, , l#
  * #C = F, F, , L#
  * #half = 1, 0, 0, 0#
+ * #fd = 0, 1, 1, 0#
  */
 
 /**begin repeat1
@@ -428,13 +430,13 @@ NPY_NO_EXPORT void
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
  **/
-/**begin repeat2
- * #ISA = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
 /**end repeat2**/
 /**end repeat1**/
 
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index c4e7b8929..2096893b7 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -12,10 +12,14 @@
  * such small operations that this file covers.
  */
 #define NPY_SIMD_FORCE_128
+#include <float.h>
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
 #include "loops_utils.h"
 #include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
 /**********************************************************
  ** Scalars
  **********************************************************/
@@ -80,6 +84,182 @@ NPY_FINLINE double c_square_f64(double a)
 #define c_rint_f32 npy_rintf
 #define c_rint_f64 npy_rint
 
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isnan_@sfx@(npyv_@sfx@ v)
+{
+    // (v != v) >> (size - 1)
+    npyv_@sfx@ r = npyv_cvt_@sfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isinf_@sfx@(npyv_@sfx@ v)
+{
+    // (abs(v) > fltmax) >> (size - 1)
+    const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+    npyv_@sfx@ r = vcagtq_@sfx@(v, fltmax);
+#else
+    // fabs via masking of sign bit
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+    r = npyv_cmpgt_@sfx@(r, fltmax);
+#endif
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_isfinite_@sfx@(npyv_@sfx@ v)
+{
+    // ((v & signmask) <= fltmax) >> (size-1)
+    const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
+#if defined(NPY_HAVE_NEON)
+    npyv_@sfx@ r = vcaleq_@sfx@(v, fltmax);
+#else
+    // fabs via masking of sign bit
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    npyv_@sfx@ r = npyv_reinterpret_@sfx@_u8(npyv_andc_u8(v, signmask));
+    r = npyv_cmple_@sfx@(r, fltmax);
+#endif
+    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+static NPY_INLINE npyv_u@ssfx@
+npyv_signbit_@sfx@(npyv_@sfx@ v)
+{
+    return npyv_shri_u@ssfx@(v, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+}
+
+#endif // @VCHK@
+/**end repeat**/
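The `npyv_isnan` helper above leans on NaN being the only value that compares unequal to itself: the lane-wide compare yields an all-ones mask, and a logical right shift by `size - 1` reduces it to the 0/1 the ufunc needs. The same trick for a single lane, in standalone C:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* One-lane model of npyv_isnan_f32: cmpneq(v, v) yields an all-ones
 * 32-bit mask for NaN lanes; shifting right by 31 turns that into
 * the boolean 1 the ufunc wants. */
static uint32_t isnan_lane_f32(float x)
{
    uint32_t mask = (x != x) ? 0xFFFFFFFFu : 0u;  /* npyv_cmpneq_f32(v, v) */
    return mask >> 31;                            /* npyv_shri_u32(r, 31)  */
}

int main(void)
{
    printf("%u %u %u\n",
           isnan_lane_f32(NAN),       /* 1 */
           isnan_lane_f32(INFINITY),  /* 0 */
           isnan_lane_f32(-3.5f));    /* 0 */
    return 0;
}
```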
+#if defined(NPY_HAVE_NEON)
+    #define PREPACK_ISFINITE 1
+    #define PREPACK_SIGNBIT 1
+
+    #if NPY_SIMD_F32
+
+    static NPY_INLINE npyv_u8
+    npyv_isfinite_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+    {
+        // F32 exponent is 8-bits, which means we can pack multiple into
+        // a single vector.  We shift out sign bit so that we're left
+        // with only exponent in high byte.  If not all bits are set,
+        // then we've got a finite number.
+        uint8x16x4_t tbl;
+        tbl.val[0] = npyv_shli_u32(v0, 1);
+        tbl.val[1] = npyv_shli_u32(v1, 1);
+        tbl.val[2] = npyv_shli_u32(v2, 1);
+        tbl.val[3] = npyv_shli_u32(v3, 1);
+
+        const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+        npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+        const npyv_u8 expmask = npyv_setall_u8(0xff);
+        r = npyv_cmpneq_u8(r, expmask);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    static NPY_INLINE npyv_u8
+    npyv_signbit_4x_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+    {
+        // We only need high byte for signbit, which means we can pack
+        // multiple inputs into a single vector.
+        uint8x16x4_t tbl;
+        tbl.val[0] = v0;
+        tbl.val[1] = v1;
+        tbl.val[2] = v2;
+        tbl.val[3] = v3;
+
+        const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+        npyv_u8 r = vqtbl4q_u8(tbl, permute);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    #endif // NPY_SIMD_F32
+
+    #if NPY_SIMD_F64
+
+    static NPY_INLINE npyv_u8
+    npyv_isfinite_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                         npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+    {
+        // F64 exponent is 11-bits, which means we can pack multiple into
+        // a single vector.  We'll need to use u16 to fit all exponent
+        // bits.  If not all bits are set, then we've got a finite number.
+        uint8x16x4_t t0123, t4567;
+        t0123.val[0] = v0;
+        t0123.val[1] = v1;
+        t0123.val[2] = v2;
+        t0123.val[3] = v3;
+        t4567.val[0] = v4;
+        t4567.val[1] = v5;
+        t4567.val[2] = v6;
+        t4567.val[3] = v7;
+
+        const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+        npyv_u16 r0 = vqtbl4q_u8(t0123, permute);
+        npyv_u16 r1 = vqtbl4q_u8(t4567, permute);
+
+        const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+        r0 = npyv_and_u16(r0, expmask);
+        r0 = npyv_cmpneq_u16(r0, expmask);
+        r0 = npyv_shri_u16(r0, 15);
+        r1 = npyv_and_u16(r1, expmask);
+        r1 = npyv_cmpneq_u16(r1, expmask);
+        r1 = npyv_shri_u16(r1, 15);
+
+        npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+        return r;
+    }
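The packing helpers above exploit the IEEE-754 layout: once the sign bit is shifted out, the full f32 exponent sits in the top byte of each lane, and only inf/NaN have that byte all-ones. A standalone check of that bit layout (assumes 32-bit IEEE-754 `float`):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Why npyv_isfinite_4x_f32 can test one byte per lane: after
 * (bits << 1) the 8 exponent bits occupy the top byte, and only
 * inf and NaN have that byte all-ones. */
static int isfinite_via_top_byte(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);              /* bit-level view   */
    uint8_t top = (uint8_t)((bits << 1) >> 24);  /* exponent byte    */
    return top != 0xFF;                          /* finite iff != FF */
}

int main(void)
{
    const float vals[] = {1.5f, -0.0f, 3.4e38f, INFINITY, NAN};
    for (size_t i = 0; i < sizeof vals / sizeof vals[0]; i++) {
        printf("%g -> %d\n", (double)vals[i], isfinite_via_top_byte(vals[i]));
    }
    return 0;   /* finite values print 1, inf/NaN print 0 */
}
```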
+    static NPY_INLINE npyv_u8
+    npyv_signbit_8x_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                        npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+    {
+        // We only need high byte for signbit, which means we can pack
+        // multiple inputs into a single vector.
+
+        // vuzp2 faster than vtbl for f64
+        npyv_u32 v01 = vuzp2q_u32(v0, v1);
+        npyv_u32 v23 = vuzp2q_u32(v2, v3);
+        npyv_u32 v45 = vuzp2q_u32(v4, v5);
+        npyv_u32 v67 = vuzp2q_u32(v6, v7);
+
+        npyv_u16 v0123 = vuzp2q_u16(v01, v23);
+        npyv_u16 v4567 = vuzp2q_u16(v45, v67);
+
+        npyv_u8 r = vuzp2q_u8(v0123, v4567);
+        r = vshrq_n_u8(r, 7);
+        return r;
+    }
+
+    #endif // NPY_SIMD_F64
+
+#else
+    #define PREPACK_ISFINITE 0
+    #define PREPACK_SIGNBIT 0
+#endif // defined(NPY_HAVE_NEON)
+
+#endif // NPY_SIMD
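`signbit` only ever needs the top bit of each lane, which is why the `vuzp2` chain above can discard everything except the high bytes before the final shift. A one-lane scalar model (assumes 64-bit IEEE-754 `double`):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar model of npyv_signbit_f64: a logical right shift by 63 leaves
 * just the sign bit, so -0.0 also reports 1, matching npy_signbit. */
static uint64_t signbit_lane_f64(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    return bits >> 63;
}

int main(void)
{
    printf("%d %d %d %d\n",
           (int)signbit_lane_f64(2.0),    /* 0 */
           (int)signbit_lane_f64(-2.0),   /* 1 */
           (int)signbit_lane_f64(-0.0),   /* 1 */
           (int)signbit_lane_f64(0.0));   /* 0 */
    return 0;
}
```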
 
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
@@ -149,6 +329,9 @@ NPY_FINLINE double c_square_f64(double a)
  * #TYPE = FLOAT, DOUBLE#
  * #sfx = f32, f64#
  * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #FDMAX = FLT_MAX, DBL_MAX#
+ * #fd = f, #
+ * #ssfx = 32, 64#
  */
 #if @VCHK@
 /**begin repeat1
@@ -249,10 +432,179 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 }
 /**end repeat2**/
 /**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ * #PREPACK = 0, 0, PREPACK_ISFINITE, PREPACK_SIGNBIT#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG,  NCONTIG, NCONTIG#
+ * #unroll = 1, 1, 1, 1#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * @unroll@ * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        // Load vectors
+        #if @STYPE@ == CONTIG
+        // contiguous input
+        npyv_@sfx@ v0_@N@ = npyv_load_@sfx@(ip + vstep * (0 + PACK_FACTOR * @N@)); // 2 * (0 + 8 * n)
+        npyv_@sfx@ v1_@N@ = npyv_load_@sfx@(ip + vstep * (1 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v2_@N@ = npyv_load_@sfx@(ip + vstep * (2 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v3_@N@ = npyv_load_@sfx@(ip + vstep * (3 + PACK_FACTOR * @N@));
+        #if PACK_FACTOR == 8
+        npyv_@sfx@ v4_@N@ = npyv_load_@sfx@(ip + vstep * (4 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v5_@N@ = npyv_load_@sfx@(ip + vstep * (5 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v6_@N@ = npyv_load_@sfx@(ip + vstep * (6 + PACK_FACTOR * @N@));
+        npyv_@sfx@ v7_@N@ = npyv_load_@sfx@(ip + vstep * (7 + PACK_FACTOR * @N@)); // 2 * (7 + 8 * n) --> 32 + 7: 39 * 2: 78
+        #endif
+        #else
+        // non-contiguous input
+        npyv_@sfx@ v0_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(0 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v1_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(1 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v2_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(2 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v3_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(3 + PACK_FACTOR * @N@), istride);
+        #if PACK_FACTOR == 8
+        npyv_@sfx@ v4_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(4 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v5_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(5 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v6_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(6 + PACK_FACTOR * @N@), istride);
+        npyv_@sfx@ v7_@N@ = npyv_loadn_@sfx@(ip + istride*vstep*(7 + PACK_FACTOR * @N@), istride);
+        #endif
+        #endif
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            #if @ssfx@ == 32
+            npyv_u8 r_@N@ = npyv_@kind@_4x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@);
+            #elif @ssfx@ == 64
+            npyv_u8 r_@N@ = npyv_@kind@_8x_@sfx@(v0_@N@, v1_@N@, v2_@N@, v3_@N@,
+                                                 v4_@N@, v5_@N@, v6_@N@, v7_@N@);
+            #endif
+        #else
+            npyv_u@ssfx@ r0_@N@ = npyv_@kind@_@sfx@(v0_@N@);
+            npyv_u@ssfx@ r1_@N@ = npyv_@kind@_@sfx@(v1_@N@);
+            npyv_u@ssfx@ r2_@N@ = npyv_@kind@_@sfx@(v2_@N@);
+            npyv_u@ssfx@ r3_@N@ = npyv_@kind@_@sfx@(v3_@N@);
+            #if PACK_FACTOR == 8
+            npyv_u@ssfx@ r4_@N@ = npyv_@kind@_@sfx@(v4_@N@);
+            npyv_u@ssfx@ r5_@N@ = npyv_@kind@_@sfx@(v5_@N@);
+            npyv_u@ssfx@ r6_@N@ = npyv_@kind@_@sfx@(v6_@N@);
+            npyv_u@ssfx@ r7_@N@ = npyv_@kind@_@sfx@(v7_@N@);
+            #endif // PACK_FACTOR == 8
+        #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+        #if @DTYPE@ == CONTIG
+            #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            // Nothing to do, results already packed
+            #else
+            // Pack results
+            #if PACK_FACTOR == 4
+            npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b32(r0_@N@, r1_@N@, r2_@N@, r3_@N@));
+            #elif PACK_FACTOR == 8
+            npyv_u8 r_@N@ = npyv_cvt_u8_b8(npyv_pack_b8_b64(r0_@N@, r1_@N@, r2_@N@, r3_@N@,
+                                                            r4_@N@, r5_@N@, r6_@N@, r7_@N@));
+            #endif // PACK_FACTOR == 8
+            #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+
+            npyv_store_u8(op + vstep * PACK_FACTOR * @N@, r_@N@);
+
+        #else // @DTYPE@ == CONTIG
+            #if @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+            // Results are packed, so we can just loop over them
+            npy_uint8 lane_@N@[npyv_nlanes_u8];
+            npyv_store_u8(lane_@N@, r_@N@);
+            for (int ln=0; ln<npyv_nlanes_u8; ++ln){
+                op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
+            }
+            #else
+            // Results are not packed. Use template to loop.
+            /**begin repeat4
+             * #R = 0, 1, 2, 3, 4, 5, 6, 7#
+             */
+            #if @R@ < PACK_FACTOR
+            npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
+            npyv_store_u8(lane@R@_@N@, r@R@_@N@);
+            op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
+            op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
+            #if npyv_nlanes_@sfx@ == 4
+            op[(2 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[2 * sizeof(npyv_lanetype_@sfx@)];
+            op[(3 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[3 * sizeof(npyv_lanetype_@sfx@)];
+            #endif
+            #endif
+            /**end repeat4**/
+            #endif // @PREPACK@ && (@ssfx@ == 32 || @ssfx@ == 64)
+        #endif // @DTYPE@ == CONTIG
+        #endif // @unroll@ > @N@
+        /**end repeat3**/
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        #if @STYPE@ == CONTIG
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        #else
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        #endif
+
+        npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, r);
+
+        op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+        op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+        #if npyv_nlanes_@sfx@ == 4
+        op[2*ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+        op[3*ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_@kind@(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
 #endif // @VCHK@
 /**end repeat**/
 
 #undef WORKAROUND_CLANG_RECIPROCAL_BUG
+#undef PREPACK_ISFINITE
+#undef PREPACK_SIGNBIT
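With the 128-bit width this file forces, `PACK_FACTOR` above evaluates to 4 for f32 and 8 for f64: that many boolean result vectors are packed into one 16-lane u8 store. A quick standalone check of the arithmetic:

```c
#include <stdio.h>

/* PACK_FACTOR = NPY_SIMD_WIDTH / npyv_nlanes_<sfx>, evaluated here for
 * the 16-byte width that NPY_SIMD_FORCE_128 pins this file to. */
int main(void)
{
    const int width_bytes = 16;                                /* NPY_SIMD_WIDTH */
    const int nlanes_f32  = width_bytes / (int)sizeof(float);  /* 4 lanes */
    const int nlanes_f64  = width_bytes / (int)sizeof(double); /* 2 lanes */
    printf("f32: PACK_FACTOR = %d\n", width_bytes / nlanes_f32); /* 4 */
    printf("f64: PACK_FACTOR = %d\n", width_bytes / nlanes_f64); /* 8 */
    return 0;
}
```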
 
 /********************************************************************************
  ** Defining ufunc inner functions
  ********************************************************************************/
@@ -316,4 +668,50 @@ clear:;
 #endif
 }
 /**end repeat1**/
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];
+    const npy_intp ostep = steps[1];
+    npy_intp len = dimensions[0];
+#if @VCHK@
+    const int ilsize = sizeof(npyv_lanetype_@sfx@);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;
+    const npy_intp ostride = ostep / olsize;
+    assert(len <= 1 || (istep % ilsize == 0 && ostep % olsize == 0));
+
+    if (!is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_@sfx@(istride) &&
+        npyv_storable_stride_@sfx@(ostride))
+    {
+        if (istride == 1 && ostride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // @VCHK@
+    {
+        UNARY_LOOP {
+            const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+            *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+        }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
 /**end repeat**/
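The dispatching inner function above selects one of four kernel instantiations from the element strides, preferring the contiguous specializations. Its decision table, reduced to standalone C (`pick_kernel` and the labels are illustrative only):

```c
#include <stdio.h>

/* The stride dispatch above: contiguous loads/stores are used
 * wherever possible, strided variants only where needed. */
static const char *pick_kernel(long istride, long ostride)
{
    if (istride == 1 && ostride == 1) return "CONTIG_CONTIG";
    if (ostride == 1)                 return "NCONTIG_CONTIG";
    if (istride == 1)                 return "CONTIG_NCONTIG";
    return "NCONTIG_NCONTIG";
}

int main(void)
{
    printf("%s\n", pick_kernel(1, 1));  /* both contiguous */
    printf("%s\n", pick_kernel(3, 1));  /* strided input   */
    printf("%s\n", pick_kernel(1, 4));  /* strided output  */
    printf("%s\n", pick_kernel(3, 4));  /* both strided    */
    return 0;
}
```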
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 10c44ce30..1a5c0ffb8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -89,63 +89,33 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
  */
 
 /**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
-        AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else {
-        return 0;
-    }
-#endif
-    return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  * #type = npy_float, npy_double, npy_longdouble#
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #vector = 1, 1, 0#
  * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
  */
+
 /**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
  */
+
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+/* prototypes */
 static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
 #endif
 
 static inline int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
-        npy_is_aligned(args[0], sizeof(@type@))) {
-        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+    if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
         return 1;
     }
 #endif
@@ -189,139 +159,6 @@ NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
 }
 /**end repeat**/
 
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
-                              npy_bool * op)
-{
-    const __m128i mask = @vpre@_set1_epi8(0x1);
-    __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
-    __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
-    __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
-    rr = @vpre@_packs_epi16(rr, rr);
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storel_epi64((__m128i*)op, rr);
-#else
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-        int r = @vpre@_movemask_@vsuf@(a);
-        if (sizeof(@type@) == 8) {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1);
-        }
-        else {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1) & 1;
-            op[i + 2] = (r >> 2) & 1;
-            op[i + 3] = (r >> 3);
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
-    /* signbit mask 0x7FFFFFFF after andnot */
-    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
-                                             @vpre@_setzero_@vsuf@());
-#if @double@
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
-        /* fabs via masking of sign bit */
-        r1 = @vpre@_andnot_@vsuf@(mask, a);
-        r2 = @vpre@_andnot_@vsuf@(mask, b);
-        r3 = @vpre@_andnot_@vsuf@(mask, c);
-        r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
-        /* negative compare against max float, nan is always true */
-        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
-        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
-        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
-        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
-        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
-        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
-        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
-        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
-        /* flip results to what we want (andnot as there is no sse not) */
-        r1 = @vpre@_andnot_@vsuf@(r1, ones);
-        r2 = @vpre@_andnot_@vsuf@(r2, ones);
-        r3 = @vpre@_andnot_@vsuf@(r3, ones);
-        r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
-        r1 = @vpre@_cmpneq_@vsuf@(a, a);
-        r2 = @vpre@_cmpneq_@vsuf@(b, b);
-        r3 = @vpre@_cmpneq_@vsuf@(c, c);
-        r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
-        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
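The removed SSE2 kernels and the new universal ones share the same comparison trick: only infinity exceeds `FLT_MAX` in magnitude, and NaN drops out because any comparison with NaN is false. Standalone version:

```c
#include <float.h>
#include <math.h>
#include <stdio.h>

/* The test behind both the removed sse2_isinf_@TYPE@ and the added
 * npyv_isinf_f32: |x| > FLT_MAX holds only for +/-inf, and NaN
 * comparisons are false, so NaN correctly yields 0. */
static int isinf_via_fltmax(float x)
{
    return fabsf(x) > FLT_MAX;
}

int main(void)
{
    printf("%d %d %d %d\n",
           isinf_via_fltmax(INFINITY),   /* 1 */
           isinf_via_fltmax(-INFINITY),  /* 1 */
           isinf_via_fltmax(NAN),        /* 0 */
           isinf_via_fltmax(FLT_MAX));   /* 0 */
    return 0;
}
```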
@@ -714,85 +551,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 /**end repeat**/
 
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
-    const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
-    npy_intp num_remaining_elements = array_size;
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
-    @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-
-    npy_int32 index_ip[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
-    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
-    __m512i ones = _mm512_set1_@episize@(1);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1;
-        if (stride_ip == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
-        }
-#if @is_signbit@
-        x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
-        @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
-        fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
-        __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
-        _mm_mask_storeu_epi8(op, load_mask, out);
-
-        ip += @num_lanes@*stride_ip;
-        op += @num_lanes@;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
 /**begin repeat
  * #TYPE = CFLOAT, CDOUBLE#
  * #type = npy_float, npy_double#
  * #num_lanes = 16, 8#
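For reference, the `IMM8` constants the deleted AVX-512 path passed to `_mm512_fpclass_@vsuffix@_mask` decode as AVX-512DQ `VFPCLASS` category bits. The enum below is an illustrative decoding, not a NumPy or intrinsics header:

```c
#include <stdio.h>

/* AVX-512DQ VFPCLASS category bits (illustrative, not an API). */
enum vfpclass_bits {
    VFPCLASS_QNAN     = 0x01,
    VFPCLASS_POS_ZERO = 0x02,
    VFPCLASS_NEG_ZERO = 0x04,  /* signbit uses this after and-ing with -0.0,
                                  which folds every negative value to -0.0 */
    VFPCLASS_POS_INF  = 0x08,
    VFPCLASS_NEG_INF  = 0x10,
    VFPCLASS_DENORMAL = 0x20,
    VFPCLASS_NEGATIVE = 0x40,
    VFPCLASS_SNAN     = 0x80
};

int main(void)
{
    /* isfinite used 0x99 and then inverted the mask with _mm512_knot() */
    printf("isnan    IMM8 = 0x%02x\n", VFPCLASS_SNAN | VFPCLASS_QNAN);      /* 0x81 */
    printf("isinf    IMM8 = 0x%02x\n", VFPCLASS_POS_INF | VFPCLASS_NEG_INF); /* 0x18 */
    printf("isfinite IMM8 = 0x%02x\n",
           VFPCLASS_SNAN | VFPCLASS_NEG_INF | VFPCLASS_POS_INF | VFPCLASS_QNAN); /* 0x99 */
    return 0;
}
```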