Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/code_generators/generate_umath.py        6
-rw-r--r--   numpy/core/setup.py                                  1
-rw-r--r--   numpy/core/src/umath/loops.c.src                    80
-rw-r--r--   numpy/core/src/umath/loops.h.src                    38
-rw-r--r--   numpy/core/src/umath/loops.h.src.orig              717
-rw-r--r--   numpy/core/src/umath/loops_minmax.dispatch.c.src   472
6 files changed, 1231 insertions, 83 deletions
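The hunks below route NumPy's integer and floating-point maximum/minimum inner loops (and the float fmax/fmin loops) through a new CPU-dispatched source, loops_minmax.dispatch.c.src, replacing the old per-type scalar loops and the AVX512F-only float paths. The contract the new kernels preserve is NumPy's split between NaN-propagating maximum/minimum and NaN-ignoring fmax/fmin (C99 fmax semantics). As a reference sketch of that split, not part of the patch (max_propagate and max_ignore are illustrative names; plain C99):

#include <math.h>
#include <stdio.h>

/* np.maximum: a NaN in either input propagates to the output.
 * The a-first test mirrors the scalar_max_f macro in the new file. */
static double max_propagate(double a, double b)
{
    return (a >= b || isnan(a)) ? a : b;
}

/* np.fmax: C99 fmax returns the non-NaN operand when one input is
 * NaN, mirroring the scalar_maxp_d macro in the new file. */
static double max_ignore(double a, double b)
{
    return fmax(a, b);
}

int main(void)
{
    printf("maximum(NaN, 1.0) = %f\n", max_propagate(NAN, 1.0)); /* nan */
    printf("fmax(NaN, 1.0)    = %f\n", max_ignore(NAN, 1.0));    /* 1.0 */
    return 0;
}

The scalar_max_f/scalar_maxp_f macro pair introduced in the new dispatch source encodes exactly this split, and on aarch64 the inline-asm versions map the pair onto the FMAX/FMAXNM instructions, which implement the same two behaviors in hardware.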
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 292d9e0d3..dc71fc5c9 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,14 +516,14 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -537,6 +537,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -544,6 +545,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a5f423d8f..1ec178445 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -976,6 +976,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_minmax.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index aaa694f34..6076e0b2d 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -724,32 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >, <# - **/ - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - io1 = (io1 @OP@ in2) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2; - } - } -} - -/**end repeat1**/ - NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1715,60 +1689,6 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? 
in1 : in2; - *((@type@ *)op1) = in1; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} /**end repeat1**/ NPY_NO_EXPORT void diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 081ca9957..3eafbdf66 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -22,7 +22,6 @@ #define BOOL_fmax BOOL_maximum #define BOOL_fmin BOOL_minimum - /* ***************************************************************************** ** BOOLEAN LOOPS ** @@ -660,6 +659,43 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /* ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + */ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** ** END LOOPS ** ***************************************************************************** */ diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..aab1b5bbb --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,717 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. 
If that's not true, then we'll + * use the scalar implementations instead. + */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, >># + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char 
**args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, 
minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void 
*NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const 
*dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end 
repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + +NPY_NO_EXPORT void +OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** MIN/MAX LOOPS ** + 
***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src new file mode 100644 index 000000000..c2eddcee6 --- /dev/null +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -0,0 +1,472 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Scalar intrinsics + ******************************************************************************/ +// signed/unsigned int +#define scalar_max_i(A, B) ((A > B) ? A : B) +#define scalar_min_i(A, B) ((A < B) ? A : B) +// fp, propagates NaNs +#if !defined(__aarch64__) +#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) +#define scalar_max_d scalar_max_f +#define scalar_max_l scalar_max_f +#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? 
A : B) +#define scalar_min_d scalar_min_f +#define scalar_min_l scalar_min_f +// fp, ignores NaNs +#define scalar_maxp_f fmaxf +#define scalar_maxp_d fmax +#define scalar_maxp_l fmaxl +#define scalar_minp_f fminf +#define scalar_minp_d fmin +#define scalar_minp_l fminl +#elif defined(__aarch64__) +/**begin repeat + * #type = npy_float, npy_double, npy_longdouble# + * #scalar_sfx = f, d, l# + * #asm_sfx = s, d, d# + */ +/**begin repeat1 + * #op = max, min, maxp, minp# + * #asm_instr = fmax, fmin, fmaxnm, fminnm# + */ +static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ + @type@ result = 0; + __asm( + "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +} +/**end repeat1**/ +/**end repeat**/ +#endif // !defined(__aarch64__) + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #is_64 = 0*6, 1*2# + */ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #if !@is_64@ + #define npyv_reduce_min_@sfx@ vminvq_@sfx@ + #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@ + #else + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a < b) ? a : b; + return result; + } + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a > b) ? a : b; + return result; + } + #endif // !@is_64@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@intrin@_i(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +/**end repeat**/ +#endif // NPY_SIMD + +/**begin repeat + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #simd_chk = NPY_SIMD, NPY_SIMD_F64# + * #scalar_sfx = f, d# + */ +#if @simd_chk@ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #define npyv_minn_@sfx@ vminq_@sfx@ + #define npyv_maxn_@sfx@ vmaxq_@sfx@ + #define npyv_reduce_minn_@sfx@ vminvq_@sfx@ + #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@ + #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@ + #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + // propagates NaNs + NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) + { + npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); + return result; + } + /**end repeat1**/ + /**begin repeat1 + * #intrin = minn, maxn, minp, maxp# + * #scalar_intrin = min, max, minp, maxp# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +#endif // simd_chk +/**end repeat**/ + 
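An aside on the select-based fallback just above: npyv_maxn builds a NaN-propagating max out of a hardware-style max (which drops NaNs) plus two selects, and the order of the selects matters. A scalar model of the same logic (editor's sketch, plain C99; model_maxn is an illustrative name):

#include <math.h>
#include <stdio.h>

/* Scalar model of the select-based npyv_maxn fallback: start from a
 * NaN-ignoring max, then patch NaN inputs back in, b first, a last,
 * so a NaN in either (or both) inputs reaches the output. */
static double model_maxn(double a, double b)
{
    double r = fmax(a, b);  /* hardware-style max, drops NaNs    */
    r = !isnan(b) ? r : b;  /* npyv_select(npyv_notnan(b), r, b) */
    r = !isnan(a) ? r : a;  /* npyv_select(npyv_notnan(a), r, a) */
    return r;
}

int main(void)
{
    printf("%f\n", model_maxn(NAN, 2.0)); /* nan: NaN propagates */
    printf("%f\n", model_maxn(1.0, 2.0)); /* 2.0                 */
    return 0;
}

Applying the a-select last means a NaN in a always wins, including when both inputs are NaN, which matches the scalar path (A >= B || npy_isnan(A)) ? A : B.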
+/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ + +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64# + * #is_fp = 0*8, 1, 1# + * #scalar_sfx = i*8, f, d# + */ +/**begin repeat1 + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ +#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@)) +#if @is_fp@ && !@fp_only@ + #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs + #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@ +#else + #define V_INTRIN npyv_@intrin@_@sfx@ + #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@ +#endif + +// contiguous input. +static inline void +simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len) +{ + // Note, 8x unroll was chosen for best results on Apple M1 + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + npy_intp i = 0; + npyv_lanetype_@sfx@ io1 = op1[0]; + + // SIMD if possible + if((i+elemPerLoop) <= len){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + m0 = V_INTRIN(m0, v0); + m1 = V_INTRIN(m1, v1); + m2 = V_INTRIN(m2, v2); + m3 = V_INTRIN(m3, v3); + m4 = V_INTRIN(m4, v4); + m5 = V_INTRIN(m5, v5); + m6 = V_INTRIN(m6, v6); + m7 = V_INTRIN(m7, v7); + } + + m0 = V_INTRIN(m0, m1); + m2 = V_INTRIN(m2, m3); + m4 = V_INTRIN(m4, m5); + m6 = V_INTRIN(m6, m7); + + m0 = V_INTRIN(m0, m2); + m4 = V_INTRIN(m4, m6); + + m0 = V_INTRIN(m0, m4); + + npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0); + + io1 = SCALAR_OP(io1, r); + } + + // Scalar - finish up any remaining iterations + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in2 = ip[i]; + io1 = SCALAR_OP(io1, in2); + } + op1[0] = io1; +} + +// contiguous inputs and output. 
+static inline void +simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2, + npyv_lanetype_@sfx@ *op1, npy_intp len) +{ + // Note, 6x unroll was chosen for best results on Apple M1 + const int vectorsPerLoop = 6; + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); + int elemPerLoop = vectorsPerLoop * elemPerVector; + + npy_intp i = 0; + + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]); + + npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + + npyv_@sfx@ m0 = V_INTRIN(v0, u0); + npyv_@sfx@ m1 = V_INTRIN(v1, u1); + npyv_@sfx@ m2 = V_INTRIN(v2, u2); + npyv_@sfx@ m3 = V_INTRIN(v3, u3); + npyv_@sfx@ m4 = V_INTRIN(v4, u4); + npyv_@sfx@ m5 = V_INTRIN(v5, u5); + + npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0); + npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1); + npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2); + npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3); + npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4); + npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5); + } + + // Scalar - finish up any remaining iterations + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in1 = ip1[i]; + const npyv_lanetype_@sfx@ in2 = ip2[i]; + + op1[i] = SCALAR_OP(in1, in2); + } +} + +#undef V_INTRIN +#undef V_REDUCE_INTRIN + +#endif // simd_chk && (!fp_only || (is_fp && fp_only)) + +#undef SCALAR_OP +/**end repeat1**/ +/**end repeat**/ + +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * float, double, long double# + * + * #is_fp = 0*10, 1*3# + * #is_unsigend = 1*5, 0*5, 0*3# + * #scalar_sfx = i*10, f, d, l# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigend@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * # kind = maximum, minimum, fmax, fmin# + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#if !@fp_only@ || (@is_fp@ && @fp_only@) +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, 
npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], + len = dimensions[0]; + npy_intp i = 0; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (IS_BINARY_REDUCE) { + // reduce and contiguous + if (is2 == sizeof(@type@)) { + TO_SIMD_SFX(simd_reduce_c_@intrin@)( + (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; + } + } + else if (!is_mem_overlap(ip1, is1, op1, os1, len) && + !is_mem_overlap(ip2, is2, op1, os1, len) + ) { + // no overlap and operands are binary contiguous + if (IS_BINARY_CONT(@type@, @type@)) { + TO_SIMD_SFX(simd_binary_ccc_@intrin@)( + (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; + } + } +#endif // TO_SIMD_SFX +#ifndef NPY_DISABLE_OPTIMIZATION + // scalar unrolls + if (IS_BINARY_REDUCE) { + // Note, 8x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= len){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0); + } + } else{ + // Note, 4x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 4; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + /* Note, we can't just load all, do all ops, then store all here. + * Sometimes ufuncs are called with `accumulate`, which makes the + * assumption that previous iterations have finished before next + * iteration. For example, the output of iteration 2 depends on the + * result of iteration 1. 
+ */ + + /**begin repeat2 + * #unroll = 0, 1, 2, 3# + */ + @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); + @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); + *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); + /**end repeat2**/ + } + } +#endif // NPY_DISABLE_OPTIMIZATION + ip1 += is1 * i; + ip2 += is2 * i; + op1 += os1 * i; + for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((@type@ *)op1) = SCALAR_OP(in1, in2); + } +#ifdef TO_SIMD_SFX +clear_fp:; +#endif +#if @is_fp@ + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +#undef SCALAR_OP + +#endif // !fp_only || (is_fp && fp_only) +/**end repeat1**/ +/**end repeat**/
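A closing note on the unroll strategy in the dispatched loops above: the reduction paths keep eight independent accumulators and fold them pairwise at the end, which breaks the loop-carried dependency chain so the CPU can overlap the min/max operations (the comments attribute the 8x/6x/4x factors to benchmarking on Apple M1). The non-reduce scalar path, by contrast, unrolls only 4x and finishes each element before starting the next, because ufuncs invoked via accumulate may alias the output with a later input. A self-contained scalar model of the multi-accumulator pattern (editor's sketch; reduce_max8 and max2 are illustrative names, NaN handling omitted):

#include <stddef.h>
#include <stdio.h>

/* NaN handling omitted; the dispatched loops use SCALAR_OP here. */
static double max2(double a, double b) { return a > b ? a : b; }

/* Eight independent accumulator chains, folded pairwise at the end,
 * as in simd_reduce_c_@intrin@ and the scalar-unroll reduce branch.
 * Assumes n >= 1, mirroring the loops' use of an initialized reduce
 * value in op1[0]. */
static double reduce_max8(const double *x, size_t n)
{
    double m0 = x[0], m1 = x[0], m2 = x[0], m3 = x[0],
           m4 = x[0], m5 = x[0], m6 = x[0], m7 = x[0];
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        m0 = max2(m0, x[i+0]); m1 = max2(m1, x[i+1]);
        m2 = max2(m2, x[i+2]); m3 = max2(m3, x[i+3]);
        m4 = max2(m4, x[i+4]); m5 = max2(m5, x[i+5]);
        m6 = max2(m6, x[i+6]); m7 = max2(m7, x[i+7]);
    }
    m0 = max2(m0, m1); m2 = max2(m2, m3);   /* pairwise fold */
    m4 = max2(m4, m5); m6 = max2(m6, m7);
    m0 = max2(m0, m2); m4 = max2(m4, m6);
    m0 = max2(m0, m4);
    for (; i < n; ++i) {                    /* scalar tail   */
        m0 = max2(m0, x[i]);
    }
    return m0;
}

int main(void)
{
    double data[] = {3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5};
    printf("%f\n", reduce_max8(data, sizeof(data) / sizeof(data[0]))); /* 9.0 */
    return 0;
}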