| author | Sayed Adel <seiko@imavr.com> | 2023-01-01 18:18:02 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2023-01-29 13:02:39 +0200 |
| commit | 783661e95a0e92372f2d1969a8c92f0d1e55e101 (patch) | |
| tree | 0b53cfdd59f17c9f401834f366a5984ff6e558b6 | |
| parent | c35f97f964a5ba263e5387ae0cc0a4393cb1da43 (diff) | |
| download | numpy-783661e95a0e92372f2d1969a8c92f0d1e55e101.tar.gz | |
ENH, SIMD: Only dispatch AVX2 for arithmetic operations
No performance gain was observed with AVX512 enabled except for `absolute`, so the complex absolute loop is moved into a new `loops_unary_complex` dispatch source that keeps the `avx512f` target, while the arithmetic loops drop it.
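For reference, the new `loops_unary_complex` kernel introduced below computes the complex modulus with the classic scaled formulation, `|z| = max(|re|,|im|) * sqrt(1 + (min/max)^2)`, rather than the naive `sqrt(re*re + im*im)`, whose intermediate squares can overflow or underflow. A minimal scalar sketch of that formulation (illustrative only, assuming C99 `<math.h>`; the dispatched code is the SIMD version in the diff below):

```c
#include <math.h>

/* Scalar sketch of what simd_cabsolute_* vectorizes below.
 * Illustrative only -- not the dispatched NumPy code. */
static double cabs_scaled(double re, double im)
{
    re = fabs(re);
    im = fabs(im);
    /* C99 complex-modulus semantics: an infinite part dominates even
     * if the other part is NaN (the SIMD code handles this with the
     * inf/nan select masks). */
    if (isinf(re) || isinf(im)) {
        return INFINITY;
    }
    double larger  = re > im ? re : im;
    double smaller = re > im ? im : re;
    /* Guard the division: larger == 0 would give 0.0/0.0 (the SIMD
     * code masks this out with div_mask). NaN inputs fall through
     * and propagate naturally. */
    if (larger == 0.0) {
        return 0.0;
    }
    double ratio = smaller / larger;
    return larger * sqrt(1.0 + ratio * ratio);
}
```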
| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 3 |
| -rw-r--r-- | numpy/core/meson.build | 1 |
| -rw-r--r-- | numpy/core/setup.py | 1 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 16 |
| -rw-r--r-- | numpy/core/src/umath/loops_arithm_fp.dispatch.c.src | 2 |
| -rw-r--r-- | numpy/core/src/umath/loops_unary_complex.dispatch.c.src | 139 |
7 files changed, 160 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index d00441edc..e83c1554b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -227,6 +227,7 @@ numpy/core/src/umath/loops_umath_fp.dispatch.c
 numpy/core/src/umath/loops_hyperbolic.dispatch.c
 numpy/core/src/umath/loops_modulo.dispatch.c
 numpy/core/src/umath/loops_comparison.dispatch.c
+numpy/core/src/umath/loops_unary_complex.dispatch.c
 # npysort module
 numpy/core/src/npysort/x86-qsort.dispatch.c
 numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 350dd19c3..bea9b44ee 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -462,7 +462,8 @@ defdict = {
           'PyUFunc_AbsoluteTypeResolver',
           TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'),
                                                  ('loops_logical', '?')]),
-          TD(cmplx, dispatch=[('loops_arithm_fp', 'FD')], out=('f', 'd', 'g')),
+          TD(cmplx, dispatch=[('loops_unary_complex', 'FD')],
+             out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
 '_arg':
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 2e349ff5a..7cbfe6dc3 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -759,6 +759,7 @@ src_umath = [
   src_file.process('src/umath/loops_unary.dispatch.c.src'),
   src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
   src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+  src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
   src_file.process('src/umath/matmul.c.src'),
   src_file.process('src/umath/matmul.h.src'),
   'src/umath/ufunc_type_resolution.c',
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 128aed779..c7e28af9c 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1017,6 +1017,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
             join('src', 'umath', 'loops_modulo.dispatch.c.src'),
             join('src', 'umath', 'loops_comparison.dispatch.c.src'),
+            join('src', 'umath', 'loops_unary_complex.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
             join('src', 'umath', 'clip.h'),
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 06945d091..47540969e 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -540,7 +540,21 @@ NPY_NO_EXPORT void
  * #TYPE = CFLOAT, CDOUBLE#
  */
 /**begin repeat1
- * #kind = add, subtract, multiply, conjugate, absolute, square#
+ * #kind = add, subtract, multiply, conjugate, square#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_complex.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = absolute#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index 183362cf2..57ffa169b 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,6 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 (avx2 fma3) avx512f
+ ** sse2 (avx2 fma3)
  ** neon asimd
  ** vsx2 vsx3
  ** vx vxe
diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
new file mode 100644
index 000000000..052ad464c
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
@@ -0,0 +1,139 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
+{
+    const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+    const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+    re = npyv_abs_@sfx@(re);
+    im = npyv_abs_@sfx@(im);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+    npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+    im = npyv_select_@sfx@(re_infmask, inf, im);
+    re = npyv_select_@sfx@(im_infmask, inf, re);
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+    npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+    im = npyv_select_@sfx@(re_nnanmask, im, nan);
+    re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+    npyv_@sfx@ larger  = npyv_max_@sfx@(re, im);
+    npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+    npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+    npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+    npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+    npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+        npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+    ));
+    return npyv_mul_@sfx@(hypot, larger);
+}
+#endif // VECTOR
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
+ * #c = f, #
+ * #C = F, #
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VECTOR@
+    npy_intp len = dimensions[0];
+    npy_intp ssrc = steps[0] / sizeof(@ftype@);
+    npy_intp sdst = steps[1] / sizeof(@ftype@);
+
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+        npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst)
+        && steps[0] % sizeof(@ftype@) == 0
+        && steps[1] % sizeof(@ftype@) == 0
+    ) {
+        const @ftype@ *src = (@ftype@*)args[0];
+              @ftype@ *dst = (@ftype@*)args[1];
+
+        const int vstep = npyv_nlanes_@sfx@;
+        const int wstep = vstep * 2;
+        const int hstep = vstep / 2;
+
+        if (ssrc == 2 && sdst == 1) {
+            for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+                npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src);
+                npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+                npyv_store_@sfx@(dst, r);
+            }
+        }
+        else {
+            for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+                npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc);
+                npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+                npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1);
+                npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+                npyv_storen_@sfx@(dst, sdst, r);
+            }
+        }
+        for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+            npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+            npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len);
+            npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im);
+            npyv_storen_till_@sfx@(dst, sdst, len, r);
+        }
+        npyv_cleanup();
+        npy_clear_floatstatus_barrier((char*)&len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const @ftype@ re = ((@ftype@ *)ip1)[0];
+        const @ftype@ im = ((@ftype@ *)ip1)[1];
+        *((@ftype@ *)op1) = npy_hypot@c@(re, im);
+    }
+}
+/**end repeat**/
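As a quick sanity check of why both the SIMD path and the `npy_hypot@c@` scalar fallback avoid the naive formula, a small standalone demo (hypothetical, assuming a C99 toolchain; not part of the commit):

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
    double re = 1e200, im = 1e200;
    /* The naive form overflows in the intermediate squares... */
    printf("naive: %g\n", sqrt(re * re + im * im));  /* inf */
    /* ...while the scaled form / hypot() stays finite. */
    printf("hypot: %g\n", hypot(re, im));            /* ~1.41421e+200 */
    return 0;
}
```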
