summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--numpy/core/code_generators/generate_umath.py36
-rw-r--r--numpy/core/setup.py1
-rw-r--r--numpy/core/src/common/npy_svml.h2
-rw-r--r--numpy/core/src/umath/loops.c.src53
-rw-r--r--numpy/core/src/umath/loops.h.src18
-rw-r--r--numpy/core/src/umath/loops_umath_fp.dispatch.c.src141
-rw-r--r--numpy/core/src/umath/simd.inc.src204
7 files changed, 169 insertions, 286 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 66f053a43..3a27a34cd 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -636,7 +636,7 @@ defdict = {
docstrings.get('numpy.core.umath.arccos'),
None,
TD('e', f='acos', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='acos', astype={'e': 'f'}),
TD(P, f='arccos'),
),
@@ -645,7 +645,7 @@ defdict = {
docstrings.get('numpy.core.umath.arccosh'),
None,
TD('e', f='acosh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='acosh', astype={'e': 'f'}),
TD(P, f='arccosh'),
),
@@ -654,7 +654,7 @@ defdict = {
docstrings.get('numpy.core.umath.arcsin'),
None,
TD('e', f='asin', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='asin', astype={'e': 'f'}),
TD(P, f='arcsin'),
),
@@ -663,7 +663,7 @@ defdict = {
docstrings.get('numpy.core.umath.arcsinh'),
None,
TD('e', f='asinh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='asinh', astype={'e': 'f'}),
TD(P, f='arcsinh'),
),
@@ -672,7 +672,7 @@ defdict = {
docstrings.get('numpy.core.umath.arctan'),
None,
TD('e', f='atan', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='atan', astype={'e': 'f'}),
TD(P, f='arctan'),
),
@@ -681,7 +681,7 @@ defdict = {
docstrings.get('numpy.core.umath.arctanh'),
None,
TD('e', f='atanh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='atanh', astype={'e': 'f'}),
TD(P, f='arctanh'),
),
@@ -691,7 +691,7 @@ defdict = {
None,
TD('e', f='cos', astype={'e': 'f'}),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
- TD('d', simd=[('avx512_skx', 'd')]),
+ TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('fdg' + cmplx, f='cos'),
TD(P, f='cos'),
),
@@ -701,7 +701,7 @@ defdict = {
None,
TD('e', f='sin', astype={'e': 'f'}),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
- TD('d', simd=[('avx512_skx', 'd')]),
+ TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('fdg' + cmplx, f='sin'),
TD(P, f='sin'),
),
@@ -710,7 +710,7 @@ defdict = {
docstrings.get('numpy.core.umath.tan'),
None,
TD('e', f='tan', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='tan', astype={'e': 'f'}),
TD(P, f='tan'),
),
@@ -719,7 +719,7 @@ defdict = {
docstrings.get('numpy.core.umath.cosh'),
None,
TD('e', f='cosh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='cosh', astype={'e': 'f'}),
TD(P, f='cosh'),
),
@@ -728,7 +728,7 @@ defdict = {
docstrings.get('numpy.core.umath.sinh'),
None,
TD('e', f='sinh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='sinh', astype={'e': 'f'}),
TD(P, f='sinh'),
),
@@ -737,7 +737,7 @@ defdict = {
docstrings.get('numpy.core.umath.tanh'),
None,
TD('e', f='tanh', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='tanh', astype={'e': 'f'}),
TD(P, f='tanh'),
),
@@ -755,7 +755,7 @@ defdict = {
docstrings.get('numpy.core.umath.exp2'),
None,
TD('e', f='exp2', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='exp2', astype={'e': 'f'}),
TD(P, f='exp2'),
),
@@ -764,7 +764,7 @@ defdict = {
docstrings.get('numpy.core.umath.expm1'),
None,
TD('e', f='expm1', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='expm1', astype={'e': 'f'}),
TD(P, f='expm1'),
),
@@ -782,7 +782,7 @@ defdict = {
docstrings.get('numpy.core.umath.log2'),
None,
TD('e', f='log2', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='log2', astype={'e': 'f'}),
TD(P, f='log2'),
),
@@ -791,7 +791,7 @@ defdict = {
docstrings.get('numpy.core.umath.log10'),
None,
TD('e', f='log10', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='log10', astype={'e': 'f'}),
TD(P, f='log10'),
),
@@ -800,7 +800,7 @@ defdict = {
docstrings.get('numpy.core.umath.log1p'),
None,
TD('e', f='log1p', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(inexact, f='log1p', astype={'e': 'f'}),
TD(P, f='log1p'),
),
@@ -818,7 +818,7 @@ defdict = {
docstrings.get('numpy.core.umath.cbrt'),
None,
TD('e', f='cbrt', astype={'e': 'f'}),
- TD('fd', simd=[('avx512_skx', 'fd')]),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
TD(flts, f='cbrt', astype={'e': 'f'}),
TD(P, f='cbrt'),
),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 0a1d6c8c3..26836e004 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -935,6 +935,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
+ join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
index da98a7ff1..4292f7090 100644
--- a/numpy/core/src/common/npy_svml.h
+++ b/numpy/core/src/common/npy_svml.h
@@ -1,4 +1,4 @@
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
extern __m512 __svml_exp2f16(__m512 x);
extern __m512 __svml_log2f16(__m512 x);
extern __m512 __svml_log10f16(__m512 x);
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5b27e618e..b1afa69a7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1532,59 +1532,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
*/
/**begin repeat
- * #TYPE = DOUBLE, FLOAT#
- * #type = npy_double, npy_float#
- * #vsub = , f#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
- * #glibcfunc = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = npy_@glibcfunc@@vsub@(in1);
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
- if (!run_unary_avx512_skx_@glibcfunc@_@TYPE@(args, dimensions, steps)) {
- @TYPE@_@func@(args, dimensions, steps, data);
- }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP {
- const npy_double in1 = *(npy_double *)ip1;
- *(npy_double *)op1 = npy_@func@(in1);
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
- if (!run_unary_avx512_skx_@func@_DOUBLE(args, dimensions, steps)) {
- DOUBLE_@func@(args, dimensions, steps, data);
- }
-}
-/**end repeat**/
-
-/**begin repeat
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index feb95ad82..0938cd050 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -210,6 +210,10 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_umath_fp.dispatch.h"
+#endif
+
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
@@ -217,11 +221,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
* #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
*/
-NPY_NO_EXPORT void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat1**/
/**end repeat**/
@@ -230,11 +231,8 @@ NPY_NO_EXPORT void
* #func = sin, cos#
*/
-NPY_NO_EXPORT void
-DOUBLE_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
new file mode 100644
index 000000000..852604655
--- /dev/null
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -0,0 +1,141 @@
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ */
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
+ npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ x;
+ #if @default_val@
+ if (ssrc == 1) {
+ x = npyv_load_till_@sfx@(src, len, @default_val@);
+ } else {
+ x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
+ }
+ #else
+ if (ssrc == 1) {
+ x = npyv_load_tillz_@sfx@(src, len);
+ } else {
+ x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ }
+ #endif
+ npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+ if (sdst == 1) {
+ npyv_store_till_@sfx@(dst, len, out);
+ } else {
+ npyv_storen_till_@sfx@(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+static void
+simd_@func@_f64(const double *src, npy_intp ssrc,
+ double *dst, npy_intp sdst, npy_intp len)
+{
+ const int vstep = npyv_nlanes_f64;
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_f64 x;
+ if (ssrc == 1) {
+ x = npyv_load_tillz_f64(src, len);
+ } else {
+ x = npyv_loadn_tillz_f64(src, ssrc, len);
+ }
+ npyv_f64 out = __svml_@func@8(x);
+ if (sdst == 1) {
+ npyv_store_till_f64(dst, len, out);
+ } else {
+ npyv_storen_till_f64(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ * #TYPE = DOUBLE, FLOAT#
+ * #type = npy_double, npy_float#
+ * #vsub = , f#
+ * #sfx = f64, f32#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+ const @type@ *src = (@type@*)args[0];
+ @type@ *dst = (@type@*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ const npy_intp len = dimensions[0];
+ assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+ if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+ npyv_loadable_stride_@sfx@(ssrc) &&
+ npyv_storable_stride_@sfx@(sdst)) {
+ simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
+ }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+ const double *src = (double*)args[0];
+ double *dst = (double*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ const npy_intp len = dimensions[0];
+ assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+ if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+ npyv_loadable_stride_f64(ssrc) &&
+ npyv_storable_stride_f64(sdst)) {
+ simd_@func@_f64(src, ssrc, dst, sdst, len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_@func@(in1);
+ }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index bca6af360..d47be9a30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,7 +33,6 @@
#include <stdlib.h>
#include <float.h>
#include <string.h> /* for memcpy */
-#include "npy_svml.h"
#define VECTOR_SIZE_BYTES 16
@@ -122,61 +121,6 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
/**end repeat**/
/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
- if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
- AVX512_SKX_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
- if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
- AVX512_SKX_@func@_DOUBLE(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #EXISTS = 1, 1, 0#
@@ -1188,154 +1132,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
#endif
/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #func_suffix = f16, 8#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #vsuffix = ps, pd#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
- @type@* ip = (@type@*) args[0];
- @type@* op = (@type@*) args[1];
- const npy_intp array_size = dimensions[0];
- const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(@type@);
- const npy_intp stride_op = steps[1] / (npy_intp)sizeof(@type@);
- npy_intp num_remaining_elements = array_size;
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum
- * index will fit in an int32 as a precondition for this function via
- * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
- */
-
- npy_int32 index_ip[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_ip[ii] = ii*stride_ip;
- }
- npy_int32 index_op[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_op[ii] = ii*stride_op;
- }
- const @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
- const @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-
- const @vtype@ val_f = _mm512_set1_@vsuffix@(@default_val@);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(num_remaining_elements, @num_lanes@);
- }
-
- @vtype@ x1;
- if (stride_ip == 1) {
- x1 = _mm512_mask_loadu_@vsuffix@(val_f, load_mask, ip);
- }
- else {
- x1 = _mm512_mask_i32gather_@vsuffix@(val_f, load_mask, vindex_ip, ip, @scale@);
- }
-
- @vtype@ out = __svml_@func@@func_suffix@(x1);
-
- if (stride_op == 1) {
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
- }
- else {
- _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
- }
-
- ip += @num_lanes@*stride_ip;
- op += @num_lanes@*stride_op;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
- npy_double* ip = (npy_double*) args[0];
- npy_double* op = (npy_double*) args[1];
- const npy_intp array_size = dimensions[0];
- const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(npy_double);
- const npy_intp stride_op = steps[1] / (npy_intp)sizeof(npy_double);
- npy_intp num_remaining_elements = array_size;
-
- __mmask8 load_mask = avx512_get_full_load_mask_pd();
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum
- * index will fit in an int32 as a precondition for this function via
- * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
- */
-
- npy_int32 index_ip[8];
- for (npy_int32 ii = 0; ii < 8; ii++) {
- index_ip[ii] = ii*stride_ip;
- }
- npy_int32 index_op[8];
- for (npy_int32 ii = 0; ii < 8; ii++) {
- index_op[ii] = ii*stride_op;
- }
- const __m256i vindex_ip = _mm256_loadu_si256((__m256i*)&index_ip[0]);
- const __m256i vindex_op = _mm256_loadu_si256((__m256i*)&index_op[0]);
-
- const __m512d val_f = _mm512_setzero_pd();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < 8) {
- load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, 8);
- }
-
- __m512d x1;
- if (stride_ip == 1) {
- x1 = _mm512_mask_loadu_pd(val_f, load_mask, ip);
- }
- else {
- x1 = _mm512_mask_i32gather_pd(val_f, load_mask, vindex_ip, ip, 8);
- }
-
- __m512d out = __svml_@func@8(x1);
-
- if (stride_op == 1) {
- _mm512_mask_storeu_pd(op, load_mask, out);
- }
- else {
- _mm512_mask_i32scatter_pd(op, load_mask, vindex_op, out, 8);
- }
-
- ip += 8*stride_ip;
- op += 8*stride_op;
- num_remaining_elements -= 8;
- }
-}
-#endif
-/**end repeat**/
-
/**begin repeat
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#