Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/code_generators/generate_umath.py          36
-rw-r--r--   numpy/core/setup.py                                     1
-rw-r--r--   numpy/core/src/common/npy_svml.h                        2
-rw-r--r--   numpy/core/src/umath/loops.c.src                       53
-rw-r--r--   numpy/core/src/umath/loops.h.src                       18
-rw-r--r--   numpy/core/src/umath/loops_umath_fp.dispatch.c.src    141
-rw-r--r--   numpy/core/src/umath/simd.inc.src                     204
7 files changed, 169 insertions, 286 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 66f053a43..3a27a34cd 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -636,7 +636,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arccos'),
           None,
           TD('e', f='acos', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='acos', astype={'e': 'f'}),
           TD(P, f='arccos'),
           ),
@@ -645,7 +645,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arccosh'),
           None,
           TD('e', f='acosh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='acosh', astype={'e': 'f'}),
           TD(P, f='arccosh'),
           ),
@@ -654,7 +654,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arcsin'),
           None,
           TD('e', f='asin', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='asin', astype={'e': 'f'}),
           TD(P, f='arcsin'),
           ),
@@ -663,7 +663,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arcsinh'),
           None,
           TD('e', f='asinh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='asinh', astype={'e': 'f'}),
           TD(P, f='arcsinh'),
           ),
@@ -672,7 +672,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arctan'),
           None,
           TD('e', f='atan', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='atan', astype={'e': 'f'}),
           TD(P, f='arctan'),
           ),
@@ -681,7 +681,7 @@ defdict = {
           docstrings.get('numpy.core.umath.arctanh'),
           None,
           TD('e', f='atanh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='atanh', astype={'e': 'f'}),
           TD(P, f='arctanh'),
           ),
@@ -691,7 +691,7 @@ defdict = {
           None,
           TD('e', f='cos', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('d', simd=[('avx512_skx', 'd')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='cos'),
           TD(P, f='cos'),
           ),
@@ -701,7 +701,7 @@ defdict = {
           None,
           TD('e', f='sin', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('d', simd=[('avx512_skx', 'd')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='sin'),
           TD(P, f='sin'),
           ),
@@ -710,7 +710,7 @@ defdict = {
           docstrings.get('numpy.core.umath.tan'),
           None,
           TD('e', f='tan', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='tan', astype={'e': 'f'}),
           TD(P, f='tan'),
           ),
@@ -719,7 +719,7 @@ defdict = {
           docstrings.get('numpy.core.umath.cosh'),
           None,
           TD('e', f='cosh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='cosh', astype={'e': 'f'}),
           TD(P, f='cosh'),
           ),
@@ -728,7 +728,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sinh'),
           None,
           TD('e', f='sinh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='sinh', astype={'e': 'f'}),
           TD(P, f='sinh'),
           ),
@@ -737,7 +737,7 @@ defdict = {
           docstrings.get('numpy.core.umath.tanh'),
           None,
           TD('e', f='tanh', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='tanh', astype={'e': 'f'}),
           TD(P, f='tanh'),
           ),
@@ -755,7 +755,7 @@ defdict = {
           docstrings.get('numpy.core.umath.exp2'),
           None,
           TD('e', f='exp2', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='exp2', astype={'e': 'f'}),
           TD(P, f='exp2'),
           ),
@@ -764,7 +764,7 @@ defdict = {
           docstrings.get('numpy.core.umath.expm1'),
           None,
           TD('e', f='expm1', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='expm1', astype={'e': 'f'}),
           TD(P, f='expm1'),
           ),
@@ -782,7 +782,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log2'),
           None,
           TD('e', f='log2', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log2', astype={'e': 'f'}),
           TD(P, f='log2'),
           ),
@@ -791,7 +791,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log10'),
           None,
           TD('e', f='log10', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log10', astype={'e': 'f'}),
           TD(P, f='log10'),
           ),
@@ -800,7 +800,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log1p'),
           None,
           TD('e', f='log1p', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='log1p', astype={'e': 'f'}),
           TD(P, f='log1p'),
           ),
@@ -818,7 +818,7 @@ defdict = {
           docstrings.get('numpy.core.umath.cbrt'),
           None,
           TD('e', f='cbrt', astype={'e': 'f'}),
-          TD('fd', simd=[('avx512_skx', 'fd')]),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(flts, f='cbrt', astype={'e': 'f'}),
           TD(P, f='cbrt'),
           ),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 0a1d6c8c3..26836e004 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -935,6 +935,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
+            join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
index da98a7ff1..4292f7090 100644
--- a/numpy/core/src/common/npy_svml.h
+++ b/numpy/core/src/common/npy_svml.h
@@ -1,4 +1,4 @@
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
 extern __m512 __svml_exp2f16(__m512 x);
 extern __m512 __svml_log2f16(__m512 x);
 extern __m512 __svml_log10f16(__m512 x);
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5b27e618e..b1afa69a7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1532,59 +1532,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * #TYPE = DOUBLE, FLOAT#
- * #type = npy_double, npy_float#
- * #vsub = , f#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
- * #glibcfunc = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        *(@type@ *)op1 = npy_@glibcfunc@@vsub@(in1);
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
-    if (!run_unary_avx512_skx_@glibcfunc@_@TYPE@(args, dimensions, steps)) {
-        @TYPE@_@func@(args, dimensions, steps, data);
-    }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const npy_double in1 = *(npy_double *)ip1;
-        *(npy_double *)op1 = npy_@func@(in1);
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
-{
-    if (!run_unary_avx512_skx_@func@_DOUBLE(args, dimensions, steps)) {
-        DOUBLE_@func@(args, dimensions, steps, data);
-    }
-}
-/**end repeat**/
-
-/**begin repeat
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index feb95ad82..0938cd050 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -210,6 +210,10 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**end repeat1**/
 /**end repeat**/
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_umath_fp.dispatch.h"
+#endif
+
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  */
@@ -217,11 +221,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
  * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
  */
 
-NPY_NO_EXPORT void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 
 /**end repeat1**/
 /**end repeat**/
@@ -230,11 +231,8 @@ NPY_NO_EXPORT void
  * #func = sin, cos#
  */
 
-NPY_NO_EXPORT void
-DOUBLE_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-DOUBLE_@func@_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 
 /**end repeat**/
 
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
new file mode 100644
index 000000000..852604655
--- /dev/null
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -0,0 +1,141 @@
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ */
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
+                  npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_@sfx@ x;
+    #if @default_val@
+        if (ssrc == 1) {
+            x = npyv_load_till_@sfx@(src, len, @default_val@);
+        } else {
+            x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
+        }
+    #else
+        if (ssrc == 1) {
+            x = npyv_load_tillz_@sfx@(src, len);
+        } else {
+            x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+        }
+    #endif
+        npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+        if (sdst == 1) {
+            npyv_store_till_@sfx@(dst, len, out);
+        } else {
+            npyv_storen_till_@sfx@(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+static void
+simd_@func@_f64(const double *src, npy_intp ssrc,
+                double *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_f64 x;
+        if (ssrc == 1) {
+            x = npyv_load_tillz_f64(src, len);
+        } else {
+            x = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+        npyv_f64 out = __svml_@func@8(x);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ * #TYPE = DOUBLE, FLOAT#
+ * #type = npy_double, npy_float#
+ * #vsub = , f#
+ * #sfx = f64, f32#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const @type@ *src = (@type@*)args[0];
+          @type@ *dst = (@type@*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_@sfx@(ssrc) &&
+        npyv_storable_stride_@sfx@(sdst)) {
+        simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
+    }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const double *src = (double*)args[0];
+          double *dst = (double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_f64(ssrc) &&
+        npyv_storable_stride_f64(sdst)) {
+        simd_@func@_f64(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_@func@(in1);
+    }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index bca6af360..d47be9a30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,7 +33,6 @@
 #include <stdlib.h>
 #include <float.h>
 #include <string.h> /* for memcpy */
-#include "npy_svml.h"
 
 #define VECTOR_SIZE_BYTES 16
 
@@ -122,61 +121,6 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
 /**end repeat**/
 
 /**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-    if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512_SKX_@func@_@TYPE@(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512_skx_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-    if (IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512_SKX_@func@_DOUBLE(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
  * #type = npy_float, npy_double, npy_longdouble#
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #EXISTS = 1, 1, 0#
@@ -1188,154 +1132,6 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #func_suffix = f16, 8#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #vsuffix = ps, pd#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- */
-
-/**begin repeat1
- * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
- * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    @type@* ip = (@type@*) args[0];
-    @type@* op = (@type@*) args[1];
-    const npy_intp array_size = dimensions[0];
-    const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(@type@);
-    const npy_intp stride_op = steps[1] / (npy_intp)sizeof(@type@);
-    npy_intp num_remaining_elements = array_size;
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    npy_int32 index_op[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_op[ii] = ii*stride_op;
-    }
-    const @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
-    const @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-
-    const @vtype@ val_f = _mm512_set1_@vsuffix@(@default_val@);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(num_remaining_elements, @num_lanes@);
-        }
-
-        @vtype@ x1;
-        if (stride_ip == 1) {
-            x1 = _mm512_mask_loadu_@vsuffix@(val_f, load_mask, ip);
-        }
-        else {
-            x1 = _mm512_mask_i32gather_@vsuffix@(val_f, load_mask, vindex_ip, ip, @scale@);
-        }
-
-        @vtype@ out = __svml_@func@@func_suffix@(x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        }
-        else {
-            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
-        }
-
-        ip += @num_lanes@*stride_ip;
-        op += @num_lanes@*stride_op;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && defined NPY_CAN_LINK_SVML
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    npy_double* ip = (npy_double*) args[0];
-    npy_double* op = (npy_double*) args[1];
-    const npy_intp array_size = dimensions[0];
-    const npy_intp stride_ip = steps[0] / (npy_intp)sizeof(npy_double);
-    const npy_intp stride_op = steps[1] / (npy_intp)sizeof(npy_double);
-    npy_intp num_remaining_elements = array_size;
-
-    __mmask8 load_mask = avx512_get_full_load_mask_pd();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_UNARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    npy_int32 index_op[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        index_op[ii] = ii*stride_op;
-    }
-    const __m256i vindex_ip = _mm256_loadu_si256((__m256i*)&index_ip[0]);
-    const __m256i vindex_op = _mm256_loadu_si256((__m256i*)&index_op[0]);
-
-    const __m512d val_f = _mm512_setzero_pd();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < 8) {
-            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, 8);
-        }
-
-        __m512d x1;
-        if (stride_ip == 1) {
-            x1 = _mm512_mask_loadu_pd(val_f, load_mask, ip);
-        }
-        else {
-            x1 = _mm512_mask_i32gather_pd(val_f, load_mask, vindex_ip, ip, 8);
-        }
-
-        __m512d out = __svml_@func@8(x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_pd(op, load_mask, out);
-        }
-        else {
-            _mm512_mask_i32scatter_pd(op, load_mask, vindex_op, out, 8);
-        }
-
-        ip += 8*stride_ip;
-        op += 8*stride_op;
-        num_remaining_elements -= 8;
-    }
-}
-#endif
-/**end repeat**/
-
 /**begin repeat
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
