author     Sayed Adel <seiko@imavr.com>    2023-02-03 12:44:03 +0200
committer  Sayed Adel <seiko@imavr.com>    2023-02-20 05:33:57 +0200
commit     3d63e186087fbcf78764b822868870a09182a117 (patch)
tree       9a0c1a905da6bfc4c847629bad02399cc5f9158d
parent     866f41a85bddfa3ea6de551bb27f335b0f8a6a52 (diff)
download   numpy-3d63e186087fbcf78764b822868870a09182a117.tar.gz
ENH, SIMD: move auto-vectorized inner functions to new dispatchable source
-rw-r--r--  .gitignore                                               1
-rw-r--r--  numpy/core/meson.build                                   1
-rw-r--r--  numpy/core/src/umath/fast_loop_macros.h                 17
-rw-r--r--  numpy/core/src/umath/loops.c.src                       159
-rw-r--r--  numpy/core/src/umath/loops.h.src                        61
-rw-r--r--  numpy/core/src/umath/loops_autovec_int.dispatch.c.src  138
6 files changed, 182 insertions(+), 195 deletions(-)
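
For context: a .dispatch.c.src source is compiled once per enabled CPU target, and a generated header (loops_autovec_int.dispatch.h here) maps each exported symbol to the best variant for the machine NumPy runs on. A minimal sketch of that pattern, using the INT_add loop from this commit; the exact expansion of these macros lives in npy_cpu_dispatch.h, and the call-site form shown is illustrative, not code from this commit:

/* Inside the .dispatch.c.src, NPY_CPU_DISPATCH_CURFX appends the current
 * target suffix to the symbol, producing e.g. INT_add_AVX2 when the file
 * is compiled for the AVX2 target: */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_add)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);

/* At a call site, the generated header plus NPY_CPU_DISPATCH_CALL pick
 * the best compiled variant for the runtime CPU (illustrative usage): */
#ifndef NPY_DISABLE_OPTIMIZATION
    #include "loops_autovec_int.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL(INT_add, (args, dimensions, steps, NULL));
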
diff --git a/.gitignore b/.gitignore
index 48fde77fc..c15a486d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -229,6 +229,7 @@ numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
numpy/core/src/umath/loops_unary_complex.dispatch.c
+numpy/core/src/umath/loops_autovec.dispatch.c
# multiarray module
numpy/core/src/multiarray/argfunc.dispatch.c
numpy/core/src/multiarray/arraytypes.h
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 84af05ff4..2b24f12ff 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -744,6 +744,7 @@ src_umath = [
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
+ src_file.process('src/umath/loops_autovec_int.dispatch.c.src'),
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
'src/umath/ufunc_type_resolution.c',
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index cade26db4..b8c1926b2 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -12,6 +12,19 @@
#include <assert.h>
+#include "simd/simd.h"
+
+/*
+ * The largest SIMD vector size in bytes that NumPy supports;
+ * it is currently an extremely large value, as it is only used for
+ * memory overlap checks.
+ */
+#if NPY_SIMD > 0
+    // enough room for the compiler to unroll vector loops
+    #define AUTOVEC_OVERLAP_SIZE (NPY_SIMD_WIDTH * 4)
+#else
+    #define AUTOVEC_OVERLAP_SIZE 1024
+#endif
/*
* MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
* Very large step size can be as slow as processing it using scalar. The
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
- abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
- abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
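
The two hunks above relax the threshold for the in-place fast path: the output may alias one input exactly, provided the other input lies at least AUTOVEC_OVERLAP_SIZE bytes away, so a vector load from the far operand can never observe a pending vector store to the output. A standalone model of that rule; abs_ptrdiff mirrors the helper already defined in fast_loop_macros.h, while OVERLAP_SIZE and can_vectorize_inplace are hypothetical names for illustration:

#include <stddef.h>

#define OVERLAP_SIZE 1024  /* the scalar fallback value from the hunk above */

/* distance between two pointers, regardless of order */
static size_t
abs_ptrdiff(char *a, char *b)
{
    return (a > b) ? (size_t)(a - b) : (size_t)(b - a);
}

/* out == a (exact alias) is safe to vectorize only when b is far enough
 * away that vector loads from b cannot overlap vector stores to out,
 * and symmetrically for out == b. */
static int
can_vectorize_inplace(char *out, char *a, char *b)
{
    return (abs_ptrdiff(out, a) == 0 && abs_ptrdiff(out, b) >= OVERLAP_SIZE)
        || (abs_ptrdiff(out, b) == 0 && abs_ptrdiff(out, a) >= OVERLAP_SIZE);
}
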
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5684f26a5..a608351d5 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -32,16 +32,6 @@
*/
#define PW_BLOCKSIZE 128
-
-/*
- * largest simd vector size in bytes numpy supports
- * it is currently a extremely large value as it is only used for memory
- * overlap checks
- */
-#ifndef NPY_MAX_SIMD_SIZE
-#define NPY_MAX_SIMD_SIZE 1024
-#endif
-
/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"
@@ -474,74 +464,15 @@ NPY_NO_EXPORT void
}
/**begin repeat1
- * #isa = , _avx2#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
* #OP = +, -, *, &, |, ^#
*/
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions,
- npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (IS_BINARY_REDUCE) {
- BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
- }
- else {
- BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
-@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
- char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
{
char *ip1 = args[0];
char *indx = args[1];
@@ -556,86 +487,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
}
return 0;
}
-
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- * which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
- // For some reason, our macOS CI sets an "invalid" flag here, but only
- // for some types.
- npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-#endif
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-#endif
-
-/**begin repeat2
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * gcc vectorization of this is not good (PR60575) but manual integer
- * vectorization is too tedious to be worthwhile
- */
- BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- BINARY_LOOP {
- const int t1 = !!*(@type@ *)ip1;
- const int t2 = !!*(@type@ *)ip2;
- *((npy_bool *)op1) = (t1 != t2);
- }
-}
-#endif
-
/**end repeat1**/
NPY_NO_EXPORT void
@@ -1714,7 +1565,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context),
const float v = npy_half_to_float(*(npy_half *)value);
*indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
}
- return 0;
+ return 0;
}
/**end repeat**/
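
The deleted repeat blocks implemented the old dual-compilation scheme: every loop was emitted twice, a baseline copy and an "_avx2" copy guarded by HAVE_ATTRIBUTE_TARGET_AVX2, selected at import time. Expanding the removed template for TYPE=INT, kind=square gives, for illustration:

/* baseline copy (@isa@ empty, @ATTR@ empty, @CHK@ = 1) */
NPY_NO_EXPORT NPY_GCC_OPT_3 void
INT_square(char **args, npy_intp const *dimensions,
           npy_intp const *steps, void *NPY_UNUSED(data));

/* AVX2-targeted copy, only built when the compiler supports it */
#if defined(HAVE_ATTRIBUTE_TARGET_AVX2)
NPY_NO_EXPORT NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 void
INT_square_avx2(char **args, npy_intp const *dimensions,
                npy_intp const *steps, void *NPY_UNUSED(data));
#endif

Only the scalar *_indexed variants remain in loops.c.src; the element-wise loops now live in the dispatchable source added below.
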
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e393b8310..064f76980 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -123,6 +123,25 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec_int.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+/**begin repeat1
+ * #kind = invert, logical_not, conjugate, reciprocal, square, add,
+ * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift, logical_and, logical_or,
+ * logical_xor#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
/**begin repeat
* #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
*/
@@ -132,7 +151,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
* #s = , u#
* #S = , U#
*/
-
#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
#define @S@@TYPE@_fmax @S@@TYPE@_maximum
@@ -147,49 +165,15 @@ NPY_NO_EXPORT void
@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
* left_shift, right_shift#
* #OP = +, -,*, &, |, ^, <<, >>#
*/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
NPY_NO_EXPORT int
-@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+ npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**begin repeat2
@@ -217,6 +201,7 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
/**begin repeat2
* #kind = isnan, isinf, isfinite#
@@ -224,9 +209,7 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
-
/**end repeat1**/
-
/**end repeat**/
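
The new declaration block mirrors the definitions in the dispatchable source. For TYPE=INT, kind=invert the template expands to the following; NPY_CPU_DISPATCH_DECLARE then declares one suffixed prototype per configured target (e.g. INT_invert_AVX2), all resolved through the generated loops_autovec_int.dispatch.h:

NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_invert,
    (char **args, npy_intp const *dimensions, npy_intp const *steps,
     void *NPY_UNUSED(func)))
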
diff --git a/numpy/core/src/umath/loops_autovec_int.dispatch.c.src b/numpy/core/src/umath/loops_autovec_int.dispatch.c.src
new file mode 100644
index 000000000..3ad812333
--- /dev/null
+++ b/numpy/core/src/umath/loops_autovec_int.dispatch.c.src
@@ -0,0 +1,138 @@
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2 avx512_skx
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ * which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ * npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = +, -, *, &, |, ^#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+ // For some reason, our macOS CI sets an "invalid" flag here, but only
+ // for some types.
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ @type@ in2 = *(@type@ *)ip2;
+ *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+ }
+#endif
+}
+
+/**begin repeat1
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * gcc vectorization of this is not good (PR60575) but manual integer
+ * vectorization is too tedious to be worthwhile
+ */
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const int t1 = !!*(@type@ *)ip1;
+ const int t2 = !!*(@type@ *)ip2;
+ *((npy_bool *)op1) = (t1 != t2);
+ }
+}
+/**end repeat**/
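
These loops contain no intrinsics; they rely on the compiler's auto-vectorizer. In the @targets block, $maxopt raises the optimization level for this file and $autovec requests auto-vectorization flags on compilers that do not enable them by default, while the listed features (sse2/avx2/avx512_skx, neon/asimd, vsx2/vsx3, vx/vxe) control which extra variants get built. As a simplified model (not NumPy's actual macro), UNARY_LOOP_FAST(@type@, @type@, *out = in * in) reduces, for contiguous arrays, to a branch-free counted loop of the kind auto-vectorizers handle well; the real macro in fast_loop_macros.h also covers strided and in-place layouts:

#include <stddef.h>

/* illustrative stand-in for the contiguous case of INT_square */
static void
square_contig_int(const int *ip, int *op, ptrdiff_t n)
{
    for (ptrdiff_t i = 0; i < n; i++) {
        op[i] = ip[i] * ip[i];  /* no calls, no branches in the body */
    }
}
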