Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py     |  10
-rw-r--r--  numpy/core/src/common/simd/neon/math.h           |  66
-rw-r--r--  numpy/core/src/umath/loops.c.src                 |  89
-rw-r--r--  numpy/core/src/umath/loops.h.src                 |  14
-rw-r--r--  numpy/core/src/umath/loops.h.src.orig            | 717
-rw-r--r--  numpy/core/src/umath/loops_minmax.dispatch.c.src | 967
6 files changed, 1105 insertions, 758 deletions
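The new loops_minmax dispatch source in this patch distinguishes two scalar semantics: maximum/minimum propagate NaN (if either operand is NaN the result is NaN), while fmax/fmin return NaN only when both operands are NaN. A minimal standalone C sketch of those two rules follows; it is illustration only, not part of the patch, and the helper names are hypothetical (the propagating variant mirrors the patch's non-aarch64 scalar_max_d fallback, the non-propagating one simply defers to C99 fmax).

#include <math.h>
#include <stdio.h>

/* Propagating semantics (maximum/minimum): if either operand is NaN,
 * the result is NaN. Same predicate as the patch's scalar fallback:
 * (A >= B || npy_isnan(A)) ? A : B */
static double max_propagate_nan(double a, double b)
{
    return (a >= b || isnan(a)) ? a : b;
}

/* Non-propagating semantics (fmax/fmin): NaN is returned only when
 * both operands are NaN. C99 fmax() already behaves this way. */
static double max_ignore_nan(double a, double b)
{
    return fmax(a, b);
}

int main(void)
{
    printf("%g %g\n", max_propagate_nan(NAN, 1.0), max_ignore_nan(NAN, 1.0));
    /* typically prints: nan 1 */
    return 0;
}
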
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index c20468a8f..9fa87a11e 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,16 +516,14 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -539,7 +537,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -547,7 +545,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 32b5bc89e..19ea6f22f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,70 +153,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } -#define npyv_reduce_max_u8 vmaxvq_u8 -#define npyv_reduce_max_s8 vmaxvq_s8 -#define npyv_reduce_max_u16 vmaxvq_u16 -#define npyv_reduce_max_s16 vmaxvq_s16 -#define npyv_reduce_max_u32 vmaxvq_u32 -#define npyv_reduce_max_s32 vmaxvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a > b) ? a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a > b) ? a : b; - return result; -} -#define npyv_reduce_max_f32 vmaxvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_max_f64 vmaxvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_maxp_f32 vmaxnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_maxp_f64 vmaxnmvq_f64 -#endif // NPY_SIMD_F64 - -#define npyv_reduce_min_u8 vminvq_u8 -#define npyv_reduce_min_s8 vminvq_s8 -#define npyv_reduce_min_u16 vminvq_u16 -#define npyv_reduce_min_s16 vminvq_s16 -#define npyv_reduce_min_u32 vminvq_u32 -#define npyv_reduce_min_s32 vminvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a < b) ? 
a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a < b) ? a : b; - return result; -} -#define npyv_reduce_min_f32 vminvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_min_f64 vminvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_minp_f32 vminnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_minp_f64 vminnmvq_f64 -#endif // NPY_SIMD_F64 - #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 950ea011e..fa7844014 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,7 +13,6 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" -#include "loops.h" #include "npy_pycompat.h" @@ -551,7 +550,6 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -726,34 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >, <# - **/ - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - io1 = (io1 @OP@ in2) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2; - } - } -} - -/**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ - NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1597,7 +1567,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1720,65 +1689,7 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? 
in1 : in2; - *((@type@ *)op1) = in1; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**end repeat1**/ - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} /**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index aab1b5bbb..90115006f 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -23,14 +23,6 @@ #define BOOL_fmin BOOL_minimum /* - * The ARM NEON implementation of min/max for longdouble, longlong, and - * ulonglong assume that they're all 64-bits. If that's not true, then we'll - * use the scalar implementations instead. - */ -#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) -#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) - -/* ***************************************************************************** ** BOOLEAN LOOPS ** ***************************************************************************** @@ -680,32 +672,26 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /**begin repeat * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ -#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ //---------- Float ---------- /**begin repeat * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# */ - #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum, fmax, fmin# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ /* diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..aab1b5bbb --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,717 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define 
BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. + */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, 
>># + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, 
void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + 
+NPY_NO_EXPORT void +@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const 
*steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void 
+@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + 
+NPY_NO_EXPORT void +OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 06aed1bb7..c2eddcee6 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** neon + ** neon asimd **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -13,659 +13,460 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************* + ** Scalar intrinsics + ******************************************************************************/ +// signed/unsigned int +#define scalar_max_i(A, B) ((A > B) ? A : B) +#define scalar_min_i(A, B) ((A < B) ? A : B) +// fp, propagates NaNs +#if !defined(__aarch64__) +#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) +#define scalar_max_d scalar_max_f +#define scalar_max_l scalar_max_f +#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? 
A : B) +#define scalar_min_d scalar_min_f +#define scalar_min_l scalar_min_f +// fp, ignores NaNs +#define scalar_maxp_f fmaxf +#define scalar_maxp_d fmax +#define scalar_maxp_l fmaxl +#define scalar_minp_f fminf +#define scalar_minp_d fmin +#define scalar_minp_l fminl +#elif defined(__aarch64__) +/**begin repeat + * #type = npy_float, npy_double, npy_longdouble# + * #scalar_sfx = f, d, l# + * #asm_sfx = s, d, d# + */ +/**begin repeat1 + * #op = max, min, maxp, minp# + * #asm_instr = fmax, fmin, fmaxnm, fminnm# + */ +static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ + @type@ result = 0; + __asm( + "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +} +/**end repeat1**/ +/**end repeat**/ +#endif // !defined(__aarch64__) -//############################################################################## -//## Maximum, Minimum -//############################################################################## - -#ifdef NPY_HAVE_NEON -//****************************************************************************** -//** NEON Min / Max -//****************************************************************************** - - -//------------------------------------------------------------------------------ -//-- Integer -//------------------------------------------------------------------------------ +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ +#if NPY_SIMD +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #is_64 = 0*6, 1*2# + */ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #if !@is_64@ + #define npyv_reduce_min_@sfx@ vminvq_@sfx@ + #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@ + #else + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a < b) ? a : b; + return result; + } + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a > b) ? 
a : b; + return result; + } + #endif // !@is_64@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@intrin@_i(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +/**end repeat**/ +#endif // NPY_SIMD /**begin repeat - * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong# - * #sfx = s8, u8, s16, u16, s32, u32, - * s64, u64, s64, u64# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #simd_chk = NPY_SIMD, NPY_SIMD_F64# + * #scalar_sfx = f, d# */ +#if @simd_chk@ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #define npyv_minn_@sfx@ vminq_@sfx@ + #define npyv_maxn_@sfx@ vmaxq_@sfx@ + #define npyv_reduce_minn_@sfx@ vminvq_@sfx@ + #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@ + #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@ + #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + // propagates NaNs + NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) + { + npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); + return result; + } + /**end repeat1**/ + /**begin repeat1 + * #intrin = minn, maxn, minp, maxp# + * #scalar_intrin = min, max, minp, maxp# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +#endif // simd_chk +/**end repeat**/ -// Implementation below assumes longlong and ulonglong are 64-bit. -#if @HAVE_NEON_IMPL@ +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64# + * #is_fp = 0*8, 1, 1# + * #scalar_sfx = i*8, f, d# + */ /**begin repeat1 -* Arithmetic -* # kind = maximum, minimum# -* # OP = >, <# -* # vop = max, min# -*/ - -#define SCALAR_OP @TYPE@_scalar_@vop@ -static inline @type@ SCALAR_OP(@type@ a, @type@ b){ - return ((a @OP@ b) ? a : b); -} + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ +#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@)) +#if @is_fp@ && !@fp_only@ + #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs + #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@ +#else + #define V_INTRIN npyv_@intrin@_@sfx@ + #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@ +#endif -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous input. 
+static inline void +simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len) { - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - + // Note, 8x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + npy_intp i = 0; + npyv_lanetype_@sfx@ io1 = op1[0]; + // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - #define NPYV_CAST (const npyv_lanetype_@sfx@ *) - npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + if((i+elemPerLoop) <= len){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + m0 = V_INTRIN(m0, v0); + m1 = V_INTRIN(m1, v1); + m2 = V_INTRIN(m2, v2); + m3 = V_INTRIN(m3, v3); + m4 = V_INTRIN(m4, v4); + m5 = V_INTRIN(m5, v5); + m6 = V_INTRIN(m6, v6); + m7 = V_INTRIN(m7, v7); } - #undef 
NPYV_CAST + m0 = V_INTRIN(m0, m1); + m2 = V_INTRIN(m2, m3); + m4 = V_INTRIN(m4, m5); + m6 = V_INTRIN(m6, m7); - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); + m0 = V_INTRIN(m0, m2); + m4 = V_INTRIN(m4, m6); - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); + m0 = V_INTRIN(m0, m4); - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); + npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0); io1 = SCALAR_OP(io1, r); } - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp -scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); - } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); - } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } else{ - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - io1 = iop1[0]; - ip2 += i * is2; - for(; i<n; ++i, ip2 += is2){ - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in2 = ip[i]; io1 = SCALAR_OP(io1, in2); } - *((@type@ *)iop1) = io1; + op1[0] = io1; } -static inline npy_intp -simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous inputs and output. 
+static inline void +simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2, + npyv_lanetype_@sfx@ *op1, npy_intp len) { - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - + // Note, 6x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 6; - const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); int elemPerLoop = vectorsPerLoop * elemPerVector; - if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); - npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); - npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); - npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); - npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); - npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); - - npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); - npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); - npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); - npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); - npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); - npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); - - npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); - npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); - npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); - npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); - npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); - npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); - - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5); - } + npy_intp i = 0; + + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]); + + npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + + 
npyv_@sfx@ m0 = V_INTRIN(v0, u0); + npyv_@sfx@ m1 = V_INTRIN(v1, u1); + npyv_@sfx@ m2 = V_INTRIN(v2, u2); + npyv_@sfx@ m3 = V_INTRIN(v3, u3); + npyv_@sfx@ m4 = V_INTRIN(v4, u4); + npyv_@sfx@ m5 = V_INTRIN(v5, u5); + + npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0); + npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1); + npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2); + npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3); + npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4); + npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5); } - - return i; -} - -static inline npy_intp -scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - npy_intp elemPerLoop = 4; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - /* Note, we can't just load all, do all ops, then store all here. - * Sometimes ufuncs are called with `accumulate`, which makes the - * assumption that previous iterations have finished before next - * iteration. For example, the output of iteration 2 depends on the - * result of iteration 1. - */ - - /**begin repeat2 - * #unroll = 0, 1, 2, 3# - */ - @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); - @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); - *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); - /**end repeat2**/ - } - - return i; -} - -static inline void -run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - i = 0; - - if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) && - nomemoverlap(ip1, is1 * n, op1, os1 * n) && - nomemoverlap(ip2, is2 * n, op1, os1 * n)) - { - // SIMD - do as many iterations as we can with vectors - i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } else { - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - ip1 += is1 * i; - ip2 += is2 * i; - op1 += os1 * i; - for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){ - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in1 = ip1[i]; + const npyv_lanetype_@sfx@ in2 = ip2[i]; - *((@type@ *)op1) = SCALAR_OP(in1, in2); + op1[i] = SCALAR_OP(in1, in2); } } -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - run_reduce_@TYPE@_@kind@(args, dimensions, steps); - } else { - run_binary_@TYPE@_@kind@(args, dimensions, steps); - } -} +#undef V_INTRIN +#undef V_REDUCE_INTRIN -#undef SCALAR_OP +#endif // simd_chk && (!fp_only || (is_fp && fp_only)) +#undef SCALAR_OP /**end repeat1**/ - -#endif // @HAVE_NEON_IMPL@ - /**end repeat**/ - -//------------------------------------------------------------------------------ -//-- Float -//------------------------------------------------------------------------------ - +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ /**begin repeat - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #type = npy_float, npy_double, npy_longdouble# - * #sfx = f32, f64, f64# - * #asmType = s, d, d# - * #HAVE_NEON_IMPL = 
1*2, HAVE_NEON_IMPL_LONGDOUBLE# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * float, double, long double# + * + * #is_fp = 0*10, 1*3# + * #is_unsigend = 1*5, 0*5, 0*3# + * #scalar_sfx = i*10, f, d, l# */ - -// Implementation below assumes longdouble is 64-bit. -#if @HAVE_NEON_IMPL@ - +#undef TO_SIMD_SFX +#if 0 /**begin repeat1 -* Arithmetic -* # kind = maximum, minimum, fmax, fmin# -* # OP = >=, <=, >=, <=# -* # vop = max, min, maxp, minp# -* # asmInstr = fmax, fmin, fmaxnm, fminnm# -* # PROPAGATE_NAN = 1, 1, 0, 0# -*/ - -#define SCALAR_OP @TYPE@_scalar_@vop@ -static inline @type@ SCALAR_OP(@type@ a, @type@ b){ -#ifdef __aarch64__ - @type@ result = 0; - __asm( - "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]" - : [result] "=w" (result) - : [a] "w" (a), [b] "w" (b) - ); - return result; -#else - -#if @PROPAGATE_NAN@ - return ((a @OP@ b || npy_isnan(a)) ? a : b); -#else - return ((a @OP@ b || npy_isnan(b)) ? a : b); + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigend@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ #endif -#endif // __aarch64__ -} -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); +/**begin repeat1 + * # kind = maximum, minimum, fmax, fmin# + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#if !@fp_only@ || (@is_fp@ && @fp_only@) +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = 
npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], + len = dimensions[0]; + npy_intp i = 0; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (IS_BINARY_REDUCE) { + // reduce and contiguous + if (is2 == sizeof(@type@)) { + TO_SIMD_SFX(simd_reduce_c_@intrin@)( + (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); - - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); - - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); - - io1 = SCALAR_OP(io1, r); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp -scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); + else if (!is_mem_overlap(ip1, is1, op1, os1, len) && + !is_mem_overlap(ip2, is2, op1, os1, len) + ) { + // no overlap and operands are binary contiguous + if (IS_BINARY_CONT(@type@, @type@)) { + TO_SIMD_SFX(simd_binary_ccc_@intrin@)( + (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - 
if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); +#endif // TO_SIMD_SFX +#ifndef NPY_DISABLE_OPTIMIZATION + // scalar unrolls + if (IS_BINARY_REDUCE) { + // Note, 8x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= len){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0); + } } else{ - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } - - // Scalar - finish up any remaining iterations - io1 = iop1[0]; - ip2 += i * is2; - for(; i<n; ++i, ip2 += is2){ - const @type@ in2 = *(@type@ *)ip2; - io1 = SCALAR_OP(io1, in2); - } - *((@type@ *)iop1) = io1; -} - -static inline npy_intp -simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - const int vectorsPerLoop = 6; - const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - int elemPerLoop = vectorsPerLoop * elemPerVector; - - if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); - npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); - npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); - npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); - npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); - npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); - - npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); - npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); - npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); - 
npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); - npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); - npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); - - npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); - npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); - npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); - npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); - npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); - npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); - - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5); + // Note, 4x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 4; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + /* Note, we can't just load all, do all ops, then store all here. + * Sometimes ufuncs are called with `accumulate`, which makes the + * assumption that previous iterations have finished before next + * iteration. For example, the output of iteration 2 depends on the + * result of iteration 1. + */ + + /**begin repeat2 + * #unroll = 0, 1, 2, 3# + */ + @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); + @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); + *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); + /**end repeat2**/ } } - - return i; -} - -static inline npy_intp -scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - npy_intp elemPerLoop = 4; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - /* Note, we can't just load all, do all ops, then store all here. - * Sometimes ufuncs are called with `accumulate`, which makes the - * assumption that previous iterations have finished before next - * iteration. For example, the output of iteration 2 depends on the - * result of iteration 1. 
-         */
-
-        /**begin repeat2
-         * #unroll = 0, 1, 2, 3#
-         */
-        @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
-        @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
-        *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
-        /**end repeat2**/
-    }
-
-    return i;
-}
-
-static inline void
-run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    BINARY_DEFS
-
-    i = 0;
-
-    if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) &&
-       nomemoverlap(ip1, is1 * n, op1, os1 * n) &&
-       nomemoverlap(ip2, is2 * n, op1, os1 * n))
-    {
-        // SIMD - do as many iterations as we can with vectors
-        i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    } else {
-        // Unrolled scalar - do as many iterations as we can with unrolled loops
-        i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    }
-
-    // Scalar - finish up any remaining iterations
+#endif // NPY_DISABLE_OPTIMIZATION
     ip1 += is1 * i;
     ip2 += is2 * i;
     op1 += os1 * i;
-    for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ in2 = *(@type@ *)ip2;
-
         *((@type@ *)op1) = SCALAR_OP(in1, in2);
     }
-}
-
-
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        run_reduce_@TYPE@_@kind@(args, dimensions, steps);
-    } else {
-        run_binary_@TYPE@_@kind@(args, dimensions, steps);
-    }
-
+#ifdef TO_SIMD_SFX
+clear_fp:;
+#endif
+#if @is_fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 #undef SCALAR_OP
+#endif // !fp_only || (is_fp && fp_only)
 /**end repeat1**/
-
-#endif // @HAVE_NEON_IMPL@
-
 /**end repeat**/
-#endif // NPY_HAVE_NEON
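
For reference, both the removed NEON-only SCALAR_OP and the scalar_@intrin@_@scalar_sfx@ helpers used as SCALAR_OP in the new loops keep NumPy's usual NaN rules: maximum/minimum propagate a NaN from either operand, while fmax/fmin return NaN only when both operands are NaN. A minimal standalone sketch of the two scalar expressions from the diff (plain C, isnan from <math.h> standing in for npy_isnan, the aarch64 inline-asm fast path left out, and the scalar_*_d names reused only for illustration):

    #include <math.h>
    #include <stdio.h>

    /* maximum: NaN propagates -- when a is NaN the comparison is false and
     * isnan(a) forces a (the NaN) to be returned; a NaN in b falls through
     * to the "return b" branch. */
    static double scalar_max_d(double a, double b) {
        return (a >= b || isnan(a)) ? a : b;
    }

    /* fmax: NaN is suppressed -- a lone NaN in b is ignored because isnan(b)
     * selects a; only when both operands are NaN does a NaN come back. */
    static double scalar_maxp_d(double a, double b) {
        return (a >= b || isnan(b)) ? a : b;
    }

    int main(void) {
        double qnan = NAN;
        printf("maximum(1, NaN) = %g\n", scalar_max_d(1.0, qnan));   /* nan */
        printf("fmax(1, NaN)    = %g\n", scalar_maxp_d(1.0, qnan));  /* 1   */
        printf("fmax(NaN, NaN)  = %g\n", scalar_maxp_d(qnan, qnan)); /* nan */
        return 0;
    }

The only difference between the two forms is which operand gets the isnan() escape hatch, which is exactly the PROPAGATE_NAN split in the removed implementation.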
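
The TO_SIMD_SFX macro introduced in the new repeat block is pure token pasting: for each element type it appends the matching universal-intrinsics lane suffix (_u8, _s32, _f32, ...), and it is left undefined when the lane has no SIMD support (for example 64-bit floats without NPY_SIMD_F64), so the whole "#ifdef TO_SIMD_SFX" branch compiles away. A reduced sketch of the same dispatch pattern, hardcoded to the f32 case, with long in place of npy_intp and a plain-C stub where the real npyv kernel would be:

    #include <stdio.h>

    /* Hardcoded stand-in for the per-type definition in the diff: for a
     * 32-bit float lane the template resolves TO_SIMD_SFX(X) to X##_f32. */
    #define TO_SIMD_SFX(X) X##_f32
    typedef float npyv_lanetype_f32;

    /* Placeholder body only; the real kernel runs npyv_max_f32 on full
     * vectors and has different NaN behaviour. */
    static void simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1,
                                        const npyv_lanetype_f32 *ip2,
                                        npyv_lanetype_f32 *op1, long len)
    {
        for (long i = 0; i < len; ++i) {
            op1[i] = ip1[i] > ip2[i] ? ip1[i] : ip2[i];
        }
    }

    int main(void) {
        float a[4] = {1, 5, 3, 7}, b[4] = {4, 2, 6, 0}, r[4];
        /* Expands to simd_binary_ccc_max_f32(a, b, r, 4), the same call
         * shape the dispatch function uses for the contiguous binary case. */
        TO_SIMD_SFX(simd_binary_ccc_max)(a, b, r, 4);
        for (int i = 0; i < 4; ++i) {
            printf("%g ", r[i]);  /* 4 5 6 7 */
        }
        printf("\n");
        return 0;
    }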
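
The 4x-unrolled strided fallback in the new dispatch function loads, computes and stores one element before touching the next, exactly as its in-code note says, because calls such as np.maximum.accumulate hand the inner loop an output that overlaps the running-result input shifted by one element. A small aliasing sketch (ordinary C arrays, not the real ufunc machinery) of why each store must land before the next iteration's load:

    #include <stdio.h>

    int main(void) {
        /* Rough stand-in for maximum.accumulate: the "first input" of step i
         * is the output of step i-1, so out aliases an input shifted by one. */
        double data[6] = {3, 1, 4, 1, 5, 2};
        double out[6];
        out[0] = data[0];

        for (int i = 1; i < 6; ++i) {
            double in1 = out[i - 1];          /* must already hold step i-1 */
            double in2 = data[i];
            out[i] = in1 > in2 ? in1 : in2;   /* store before the next load */
        }
        for (int i = 0; i < 6; ++i) {
            printf("%g ", out[i]);            /* 3 3 4 4 5 5 */
        }
        printf("\n");
        /* Loading several in1 values up front (a wider load/compute/store
         * batch) would read out[i-1] before it has been written and keep
         * stale values, which is the failure mode the per-element loop
         * avoids. */
        return 0;
    }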
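
The 8x scalar reduce unroll kept in the new dispatch function, like the NEON reduce loop it replaces, uses the same shape throughout: eight independent accumulators m0..m7 updated every pass, folded pairwise only once at the end, so consecutive iterations never form a single serial dependency chain. A scalar illustration of that accumulate-then-fold order for a hypothetical reduce_max over a plain array (not the npyv kernel itself):

    #include <stdio.h>

    /* Pairwise-folding max reduction with 8 independent accumulators,
     * mirroring the m0..m7 pattern in the diff with scalars instead of
     * npyv vectors.  Assumes n >= 1. */
    static double reduce_max(const double *x, long n)
    {
        double m[8];
        long i = 0;
        if (n >= 8) {
            for (int k = 0; k < 8; ++k) {
                m[k] = x[k];
            }
            /* Eight independent updates per pass: no accumulator depends on
             * another until the final fold. */
            for (i = 8; i + 8 <= n; i += 8) {
                for (int k = 0; k < 8; ++k) {
                    if (x[i + k] > m[k]) m[k] = x[i + k];
                }
            }
            /* fold: (m0,m1)(m2,m3)(m4,m5)(m6,m7) -> (m0,m2)(m4,m6) -> (m0,m4) */
            for (int step = 1; step < 8; step *= 2) {
                for (int k = 0; k + step < 8; k += 2 * step) {
                    if (m[k + step] > m[k]) m[k] = m[k + step];
                }
            }
        } else {
            m[0] = x[0];
            i = 1;
        }
        for (; i < n; ++i) {   /* scalar tail, as in the diff's final loop */
            if (x[i] > m[0]) m[0] = x[i];
        }
        return m[0];
    }

    int main(void) {
        double x[11] = {1, 9, 2, 8, 3, 7, 4, 6, 5, 0, 10};
        printf("%g\n", reduce_max(x, 11));  /* 10 */
        return 0;
    }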