diff options
| author | Sayed Adel <seiko@imavr.com> | 2023-02-03 12:44:03 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2023-02-20 05:33:57 +0200 |
| commit | 3d63e186087fbcf78764b822868870a09182a117 (patch) | |
| tree | 9a0c1a905da6bfc4c847629bad02399cc5f9158d | |
| parent | 866f41a85bddfa3ea6de551bb27f335b0f8a6a52 (diff) | |
| download | numpy-3d63e186087fbcf78764b822868870a09182a117.tar.gz | |
ENH, SIMD: move auto-vectorized inner functions to new dispatchable source
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | numpy/core/meson.build | 1 | ||||
| -rw-r--r-- | numpy/core/src/umath/fast_loop_macros.h | 17 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 159 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 61 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops_autovec_int.dispatch.c.src | 138 |
6 files changed, 182 insertions, 195 deletions
diff --git a/.gitignore b/.gitignore index 48fde77fc..c15a486d9 100644 --- a/.gitignore +++ b/.gitignore @@ -229,6 +229,7 @@ numpy/core/src/umath/loops_hyperbolic.dispatch.c numpy/core/src/umath/loops_modulo.dispatch.c numpy/core/src/umath/loops_comparison.dispatch.c numpy/core/src/umath/loops_unary_complex.dispatch.c +numpy/core/src/umath/loops_autovec.dispatch.c # multiarray module numpy/core/src/multiarray/argfunc.dispatch.c numpy/core/src/multiarray/arraytypes.h diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 84af05ff4..2b24f12ff 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -744,6 +744,7 @@ src_umath = [ src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'), src_file.process('src/umath/loops_unary_complex.dispatch.c.src'), + src_file.process('src/umath/loops_autovec_int.dispatch.c.src'), src_file.process('src/umath/matmul.c.src'), src_file.process('src/umath/matmul.h.src'), 'src/umath/ufunc_type_resolution.c', diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index cade26db4..b8c1926b2 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -12,6 +12,19 @@ #include <assert.h> +#include "simd/simd.h" + +/* + * largest simd vector size in bytes numpy supports + * it is currently a extremely large value as it is only used for memory + * overlap checks + */ +#if NPY_SIMD > 0 + // Enough for compiler unroll + #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4 +#else + #define AUTOVEC_OVERLAP_SIZE 1024 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The @@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b) /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ if (abs_ptrdiff(args[2], args[0]) == 0 && \ - abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else if (abs_ptrdiff(args[2], args[1]) == 0 && \ - abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else { \ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5684f26a5..a608351d5 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -32,16 +32,6 @@ */ #define PW_BLOCKSIZE 128 - -/* - * largest simd vector size in bytes numpy supports - * it is currently a extremely large value as it is only used for memory - * overlap checks - */ -#ifndef NPY_MAX_SIMD_SIZE -#define NPY_MAX_SIMD_SIZE 1024 -#endif - /** Provides the various *_LOOP macros */ #include "fast_loop_macros.h" @@ -474,74 +464,15 @@ NPY_NO_EXPORT void } /**begin repeat1 - * #isa = , _avx2# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)# - * #ATTR = , NPY_GCC_TARGET_AVX2# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in * in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = ~in); -} -#endif - -/**begin repeat2 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor# * #OP = +, -, *, &, |, ^# */ -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, - npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); - } - else { - BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); - } -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int -@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), - char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) +NPY_NO_EXPORT NPY_GCC_OPT_3 int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) { char *ip1 = args[0]; char *indx = args[1]; @@ -556,86 +487,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int } return 0; } - -#endif - -/**end repeat2**/ - -/* - * Arithmetic bit shift operations. - * - * Intel hardware masks bit shift values, so large shifts wrap around - * and can produce surprising results. The special handling ensures that - * behavior is independent of compiler or hardware. - * TODO: We could implement consistent behavior for negative shifts, - * which is undefined in C. - */ - -#define INT_left_shift_needs_clear_floatstatus -#define UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); - -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -#endif - -#undef INT_left_shift_needs_clear_floatstatus -#undef UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift -NPY_GCC_OPT_3 -#endif -void -@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -} -#endif - -/**begin repeat2 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * gcc vectorization of this is not good (PR60575) but manual integer - * vectorization is too tedious to be worthwhile - */ - BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); -} -#endif - -/**end repeat2**/ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const int t1 = !!*(@type@ *)ip1; - const int t2 = !!*(@type@ *)ip2; - *((npy_bool *)op1) = (t1 != t2); - } -} -#endif - /**end repeat1**/ NPY_NO_EXPORT void @@ -1714,7 +1565,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context), const float v = npy_half_to_float(*(npy_half *)value); *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v); } - return 0; + return 0; } /**end repeat**/ diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index e393b8310..064f76980 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -123,6 +123,25 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec_int.dispatch.h" +#endif +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = invert, logical_not, conjugate, reciprocal, square, add, + * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift, logical_and, logical_or, + * logical_xor# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ @@ -132,7 +151,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, * #s = , u# * #S = , U# */ - #define @S@@TYPE@_floor_divide @S@@TYPE@_divide #define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed #define @S@@TYPE@_fmax @S@@TYPE@_maximum @@ -147,49 +165,15 @@ NPY_NO_EXPORT void @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - NPY_NO_EXPORT int -@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, + npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -/**end repeat3**/ - -/**begin repeat3 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**begin repeat2 @@ -217,6 +201,7 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ /**begin repeat2 * #kind = isnan, isinf, isfinite# @@ -224,9 +209,7 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ - /**end repeat1**/ - /**end repeat**/ diff --git a/numpy/core/src/umath/loops_autovec_int.dispatch.c.src b/numpy/core/src/umath/loops_autovec_int.dispatch.c.src new file mode 100644 index 000000000..3ad812333 --- /dev/null +++ b/numpy/core/src/umath/loops_autovec_int.dispatch.c.src @@ -0,0 +1,138 @@ +/*@targets + ** $maxopt $autovec baseline + ** sse2 avx2 avx512_skx + ** neon asimd + ** vsx2 vsx3 + ** vx vxe + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + + +/* + * Arithmetic bit shift operations. + * + * Intel hardware masks bit shift values, so large shifts wrap around + * and can produce surprising results. The special handling ensures that + * behavior is independent of compiler or hardware. + * TODO: We could implement consistent behavior for negative shifts, + * which is undefined in C. + */ +#define INT_left_shift_needs_clear_floatstatus +#define UINT_left_shift_needs_clear_floatstatus + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double, + * npy_double, npy_double, npy_double, npy_double# + * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# + * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in * in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = ~in); +} + +/**begin repeat1 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor# + * #OP = +, -, *, &, |, ^# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} + +/**begin repeat1 + * #kind = logical_and, logical_or# + * #OP = &&, ||# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * gcc vectorization of this is not good (PR60575) but manual integer + * vectorization is too tedious to be worthwhile + */ + BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const int t1 = !!*(@type@ *)ip1; + const int t2 = !!*(@type@ *)ip2; + *((npy_bool *)op1) = (t1 != t2); + } +} +/**end repeat**/ |
