summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2023-01-01 18:18:02 +0200
committerSayed Adel <seiko@imavr.com>2023-01-29 13:02:39 +0200
commit783661e95a0e92372f2d1969a8c92f0d1e55e101 (patch)
tree0b53cfdd59f17c9f401834f366a5984ff6e558b6
parentc35f97f964a5ba263e5387ae0cc0a4393cb1da43 (diff)
downloadnumpy-783661e95a0e92372f2d1969a8c92f0d1e55e101.tar.gz
ENH, SIMD: Only dispatch AVX2 for arithmetic operations
no performance gain with AVX512 enabled except for absolute
-rw-r--r--.gitignore1
-rw-r--r--numpy/core/code_generators/generate_umath.py3
-rw-r--r--numpy/core/meson.build1
-rw-r--r--numpy/core/setup.py1
-rw-r--r--numpy/core/src/umath/loops.h.src16
-rw-r--r--numpy/core/src/umath/loops_arithm_fp.dispatch.c.src2
-rw-r--r--numpy/core/src/umath/loops_unary_complex.dispatch.c.src139
7 files changed, 160 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
index d00441edc..e83c1554b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -227,6 +227,7 @@ numpy/core/src/umath/loops_umath_fp.dispatch.c
numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
+numpy/core/src/umath/loops_unary_complex.dispatch.c
# npysort module
numpy/core/src/npysort/x86-qsort.dispatch.c
numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 350dd19c3..bea9b44ee 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -462,7 +462,8 @@ defdict = {
'PyUFunc_AbsoluteTypeResolver',
TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'),
('loops_logical', '?')]),
- TD(cmplx, dispatch=[('loops_arithm_fp', 'FD')], out=('f', 'd', 'g')),
+ TD(cmplx, dispatch=[('loops_unary_complex', 'FD')],
+ out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
'_arg':
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 2e349ff5a..7cbfe6dc3 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -759,6 +759,7 @@ src_umath = [
src_file.process('src/umath/loops_unary.dispatch.c.src'),
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+ src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
'src/umath/ufunc_type_resolution.c',
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 128aed779..c7e28af9c 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1017,6 +1017,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
join('src', 'umath', 'loops_comparison.dispatch.c.src'),
+ join('src', 'umath', 'loops_unary_complex.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 06945d091..47540969e 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -540,7 +540,21 @@ NPY_NO_EXPORT void
* #TYPE = CFLOAT, CDOUBLE#
*/
/**begin repeat1
- * #kind = add, subtract, multiply, conjugate, absolute, square#
+ * #kind = add, subtract, multiply, conjugate, square#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_complex.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = absolute#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index 183362cf2..57ffa169b 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,6 @@
/*@targets
** $maxopt baseline
- ** sse2 (avx2 fma3) avx512f
+ ** sse2 (avx2 fma3)
** neon asimd
** vsx2 vsx3
** vx vxe
diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
new file mode 100644
index 000000000..052ad464c
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
@@ -0,0 +1,139 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = u32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
+{
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
+}
+#endif // VECTOR
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
+ * #c = f, #
+ * #C = F, #
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VECTOR@
+ npy_intp len = dimensions[0];
+ npy_intp ssrc = steps[0] / sizeof(@ftype@);
+ npy_intp sdst = steps[1] / sizeof(@ftype@);
+
+ if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+ npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst)
+ && steps[0] % sizeof(@ftype@) == 0
+ && steps[1] % sizeof(@ftype@) == 0
+ ) {
+ const @ftype@ *src = (@ftype@*)args[0];
+ @ftype@ *dst = (@ftype@*)args[1];
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ if (ssrc == 2 && sdst == 1) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+ npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_store_@sfx@(dst, r);
+ }
+ }
+ else {
+ for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc);
+ npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+ npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_storen_@sfx@(dst, sdst, r);
+ }
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im);
+ npyv_storen_till_@sfx@(dst, sdst, len, r);
+ }
+ npyv_cleanup();
+ npy_clear_floatstatus_barrier((char*)&len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const @ftype@ re = ((@ftype@ *)ip1)[0];
+ const @ftype@ im = ((@ftype@ *)ip1)[1];
+ *((@ftype@ *)op1) = npy_hypot@c@(re, im);
+ }
+}
+/**end repeat**/