Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/code_generators/generate_umath.py          36
-rw-r--r--   numpy/core/setup.py                                     1
-rw-r--r--   numpy/core/src/common/npy_svml.h                        2
-rw-r--r--   numpy/core/src/umath/loops.c.src                       53
-rw-r--r--   numpy/core/src/umath/loops.h.src                       18
-rw-r--r--   numpy/core/src/umath/loops_umath_fp.dispatch.c.src    141
-rw-r--r--   numpy/core/src/umath/simd.inc.src                     204
7 files changed, 169 insertions, 286 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 66f053a43..3a27a34cd 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -636,7 +636,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arccos'),
           None,
           TD('e', f='acos', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='acos', astype={'e': 'f'}),
           TD(P, f='arccos'),
           ),
@@ -645,7 +645,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arccosh'),
           None,
           TD('e', f='acosh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='acosh', astype={'e': 'f'}),
           TD(P, f='arccosh'),
           ),
@@ -654,7 +654,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arcsin'),
           None,
           TD('e', f='asin', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='asin', astype={'e': 'f'}),
           TD(P, f='arcsin'),
           ),
@@ -663,7 +663,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arcsinh'),
           None,
           TD('e', f='asinh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='asinh', astype={'e': 'f'}),
           TD(P, f='arcsinh'),
           ),
@@ -672,7 +672,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arctan'),
           None,
           TD('e', f='atan', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='atan', astype={'e': 'f'}),
           TD(P, f='arctan'),
           ),
@@ -681,7 +681,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arctanh'),
           None,
           TD('e', f='atanh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='atanh', astype={'e': 'f'}),
           TD(P, f='arctanh'),
           ),
@@ -691,7 +691,7 @@ defdict = {
           None,
           TD('e', f='cos', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('d', simd=[('avx512_skx', 'd')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='cos'),
           TD(P, f='cos'),
           ),
@@ -701,7 +701,7 @@ defdict = {
           None,
           TD('e', f='sin', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('d', simd=[('avx512_skx', 'd')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='sin'),
           TD(P, f='sin'),
           ),
@@ -710,7 +710,7 @@ defdict = {
           docstrings.get('numpy.core.umath.tan'),
           None,
           TD('e', f='tan', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='tan', astype={'e': 'f'}),
           TD(P, f='tan'),
           ),
@@ -719,7 +719,7 @@ defdict = {
           docstrings.get('numpy.core.umath.cosh'),
           None,
           TD('e', f='cosh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='cosh', astype={'e': 'f'}),
           TD(P, f='cosh'),
           ),
@@ -728,7 +728,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sinh'),
           None,
           TD('e', f='sinh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='sinh', astype={'e': 'f'}),
           TD(P, f='sinh'),
           ),
@@ -737,7 +737,7 @@ defdict = {
           docstrings.get('numpy.core.umath.tanh'),
           None,
           TD('e', f='tanh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='tanh', astype={'e': 'f'}),
           TD(P, f='tanh'),
           ),
@@ -755,7 +755,7 @@ defdict = {
           docstrings.get('numpy.core.umath.exp2'),
           None,
           TD('e', f='exp2', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='exp2', astype={'e': 'f'}),
           TD(P, f='exp2'),
           ),
@@ -764,7 +764,7 @@ defdict = {
           docstrings.get('numpy.core.umath.expm1'),
           None,
           TD('e', f='expm1', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='expm1', astype={'e': 'f'}),
           TD(P, f='expm1'),
           ),
@@ -782,7 +782,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log2'),
           None,
           TD('e', f='log2', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log2', astype={'e': 'f'}),
           TD(P, f='log2'),
           ),
@@ -791,7 +791,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log10'),
           None,
           TD('e', f='log10', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log10', astype={'e': 'f'}),
           TD(P, f='log10'),
           ),
@@ -800,7 +800,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log1p'),
           None,
           TD('e', f='log1p', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log1p', astype={'e': 'f'}),
           TD(P, f='log1p'),
           ),
@@ -818,7 +818,7 @@ defdict = {
           docstrings.get('numpy.core.umath.cbrt'),
           None,
           TD('e', f='cbrt', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(flts, f='cbrt', astype={'e': 'f'}),
           TD(P, f='cbrt'),
           ),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 0a1d6c8c3..26836e004 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -935,6 +935,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
+            join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
index da98a7ff1..4292f7090 100644
--- a/numpy/core/src/common/npy_svml.h
+++ b/numpy/core/src/common/npy_svml.h
@@ -1,4 +1,4 @@
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
 extern __m512 __svml_exp2f16(__m512 x);
 extern __m512 __svml_log2f16(__m512 x);
 extern __m512 __svml_log10f16(__m512 x);
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5b27e618e..b1afa69a7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1532,59 +1532,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * #TYPE = DOUBLE, FLOAT#
- * #type = npy_double, npy_float#
- * #vsub = , f#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
- * #glibcfunc = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        *(@type@ *)op1 = npy_@glibcfunc@@vsub@(in1);
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
-    if (!run_unary_avx512_skx_@glibcfunc@_@TYPE@(args, dimensions, steps)) {
-        @TYPE@_@func@(args, dimensions, steps, data);
-    }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const npy_double in1 = *(npy_double *)ip1;
-        *(npy_double *)op1 = npy_@func@(in1);
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
-    if (!run_unary_avx512_skx_@func@_DOUBLE(args, dimensions, steps)) {
-        DOUBLE_@func@(args, dimensions, steps, data);
-    }
-}
-/**end repeat**/
-
-/**begin repeat
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index feb95ad82..0938cd050 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -210,6 +210,10 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**end repeat1**/
 /**end repeat**/
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_umath_fp.dispatch.h"
+#endif
+
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  */
@@ -217,11 +221,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
  * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
  */
 
-NPY_NO_EXPORT void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 
 /**end repeat1**/
 /**end repeat**/
@@ -230,11 +231,8 @@ NPY_NO_EXPORT void
  * #func = sin, cos#
  */
 
-NPY_NO_EXPORT void
-DOUBLE_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 
 /**end repeat**/
 
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
new file mode 100644
index 000000000..852604655
--- /dev/null
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -0,0 +1,141 @@
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ */
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
+                  npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_@sfx@ x;
+    #if @default_val@
+        if (ssrc == 1) {
+            x = npyv_load_till_@sfx@(src, len, @default_val@);
+        } else {
+            x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
+        }
+    #else
+        if (ssrc == 1) {
+            x = npyv_load_tillz_@sfx@(src, len);
+        } else {
+            x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+        }
+    #endif
+        npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+        if (sdst == 1) {
+            npyv_store_till_@sfx@(dst, len, out);
+        } else {
+            npyv_storen_till_@sfx@(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+static void
+simd_@func@_f64(const double *src, npy_intp ssrc,
+                double *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_f64 x;
+        if (ssrc == 1) {
+            x = npyv_load_tillz_f64(src, len);
+        } else {
+            x = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+        npyv_f64 out = __svml_@func@8(x);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ * #TYPE = DOUBLE, FLOAT#
+ * #type = npy_double, npy_float#
+ * #vsub = , f#
+ * #sfx = f64, f32#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const @type@ *src = (@type@*)args[0];
+          @type@ *dst = (@type@*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_@sfx@(ssrc) &&
+        npyv_storable_stride_@sfx@(sdst)) {
+        simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
+    }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const double *src = (double*)args[0];
+          double *dst = (double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_f64(ssrc) &&
+        npyv_storable_stride_f64(sdst)) {
+        simd_@func@_f64(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_@func@(in1);
+    }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index bca6af360..d47be9a30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,7 +33,6 @@
 #include <stdlib.h>
 #include <float.h>
 #include <string.h> /* for memcpy */
-#include "npy_svml.h"
 
 #define VECTOR_SIZE_BYTES 16
 
@@ -122,61 +121,6 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
 /**end repeat**/
 
 /**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-    if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512_SKX_@func@_@TYPE@(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-    if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512_SKX_@func@_DOUBLE(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
  * #type = npy_float, npy_double, npy_longdouble#
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #EXISTS = 1, 1, 0#
@@ -1188,154 +1132,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #func_suffix = f16, 8#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #vsuffix = ps, pd#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    @type@* ip = (@type@*) args[0];
-    @type@* op = (@type@*) args[1];
-    const npy_intp array_size = dimensions[0];
-    const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(@type@);
-    const npy_intp stride_op = steps[1] / (npy_intp)sizeof(@type@);
-    npy_intp num_remaining_elements = array_size;
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    npy_int32 index_op[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_op[ii] = ii*stride_op;
-    }
-    const @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
-    const @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-
-    const @vtype@ val_f = _mm512_set1_@vsuffix@(@default_val@);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(num_remaining_elements, @num_lanes@);
-        }
-
-        @vtype@ x1;
-        if (stride_ip == 1) {
-            x1 = _mm512_mask_loadu_@vsuffix@(val_f, load_mask, ip);
-        }
-        else {
-            x1 = _mm512_mask_i32gather_@vsuffix@(val_f, load_mask, vindex_ip, ip, @scale@);
-        }
-
-        @vtype@ out = __svml_@func@@func_suffix@(x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        }
-        else {
-            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
-        }
-
-        ip += @num_lanes@*stride_ip;
-        op += @num_lanes@*stride_op;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    npy_double* ip = (npy_double*) args[0];
-    npy_double* op = (npy_double*) args[1];
-    const npy_intp array_size = dimensions[0];
-    const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(npy_double);
-    const npy_intp stride_op = steps[1] / (npy_intp)sizeof(npy_double);
-    npy_intp num_remaining_elements = array_size;
-
-    __mmask8 load_mask = avx512_get_full_load_mask_pd();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    npy_int32 index_op[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        index_op[ii] = ii*stride_op;
-    }
-    const __m256i vindex_ip = _mm256_loadu_si256((__m256i*)&index_ip[0]);
-    const __m256i vindex_op = _mm256_loadu_si256((__m256i*)&index_op[0]);
-
-    const __m512d val_f = _mm512_setzero_pd();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < 8) {
-            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, 8);
-        }
-
-        __m512d x1;
-        if (stride_ip == 1) {
-            x1 = _mm512_mask_loadu_pd(val_f, load_mask, ip);
-        }
-        else {
-            x1 = _mm512_mask_i32gather_pd(val_f, load_mask, vindex_ip, ip, 8);
-        }
-
-        __m512d out = __svml_@func@8(x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_pd(op, load_mask, out);
-        }
-        else {
-            _mm512_mask_i32scatter_pd(op, load_mask, vindex_op, out, 8);
-        }
-
-        ip += 8*stride_ip;
-        op += 8*stride_op;
-        num_remaining_elements -= 8;
-    }
-}
-#endif
-/**end repeat**/
-
 /**begin repeat
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
