Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 14
-rw-r--r-- | numpy/core/setup.py | 3
-rw-r--r-- | numpy/core/src/umath/fast_loop_macros.h | 15
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 138
-rw-r--r-- | numpy/core/src/umath/loops.h.src | 35
-rw-r--r-- | numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 1293
-rw-r--r-- | numpy/core/src/umath/npy_simd_data.h | 8
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 1088
8 files changed, 1347 insertions, 1247 deletions
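Before the per-file hunks, the mechanism everything below migrates to is NumPy's CPU-dispatch framework: a *.dispatch.c.src source is compiled once per target named in its /*@targets*/ block, and loops.h.src declares one dispatchable symbol per loop. A minimal C sketch of the two sides (the DECLARE form mirrors the declarations added in this diff; the NPY_CPU_DISPATCH_CALL site is an illustrative assumption about the generated caller, not a hunk from this commit):

    /* loops.h.src side: declare the dispatchable loop */
    #ifndef NPY_DISABLE_OPTIMIZATION
        #include "loops_exponent_log.dispatch.h"
    #endif
    NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_exp, (
        char **args, npy_intp const *dimensions, npy_intp const *steps, void *func
    ))

    /* caller side (sketch): resolves at runtime to the best variant that
       was compiled, e.g. the avx512f build on Skylake-X, else baseline */
    NPY_CPU_DISPATCH_CALL(FLOAT_exp, (args, dimensions, steps, NULL));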
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 6ee8031cb..b5305fbfc 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -722,8 +722,7 @@ defdict = { docstrings.get('numpy.core.umath.exp'), None, TD('e', f='exp', astype={'e':'f'}), - TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]), - TD('d', simd=[('avx512f', 'd')]), + TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='exp'), TD(P, f='exp'), ), @@ -746,8 +745,7 @@ defdict = { docstrings.get('numpy.core.umath.log'), None, TD('e', f='log', astype={'e':'f'}), - TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]), - TD('d', simd=[('avx512f', 'd')]), + TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='log'), TD(P, f='log'), ), @@ -920,10 +918,10 @@ defdict = { docstrings.get('numpy.core.umath.ldexp'), None, [TypeDescription('e', None, 'ei', 'e'), - TypeDescription('f', None, 'fi', 'f', simd=['avx512_skx']), + TypeDescription('f', None, 'fi', 'f', dispatch='loops_exponent_log'), TypeDescription('e', FuncNameSuffix('long'), 'el', 'e'), TypeDescription('f', FuncNameSuffix('long'), 'fl', 'f'), - TypeDescription('d', None, 'di', 'd', simd=['avx512_skx']), + TypeDescription('d', None, 'di', 'd', dispatch='loops_exponent_log'), TypeDescription('d', FuncNameSuffix('long'), 'dl', 'd'), TypeDescription('g', None, 'gi', 'g'), TypeDescription('g', FuncNameSuffix('long'), 'gl', 'g'), @@ -934,8 +932,8 @@ defdict = { docstrings.get('numpy.core.umath.frexp'), None, [TypeDescription('e', None, 'e', 'ei'), - TypeDescription('f', None, 'f', 'fi', simd=['avx512_skx']), - TypeDescription('d', None, 'd', 'di', simd=['avx512_skx']), + TypeDescription('f', None, 'f', 'fi', dispatch='loops_exponent_log'), + TypeDescription('d', None, 'd', 'di', dispatch='loops_exponent_log'), TypeDescription('g', None, 'g', 'gi'), ], ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 1042a1c45..dfb26c9c1 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -920,6 +920,8 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h.src'), @@ -929,7 +931,6 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'scalarmath.c.src'), join('src', 'umath', 'ufunc_type_resolution.c'), join('src', 'umath', 'override.c'), - join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), ] umath_deps = [ diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index dbcff8793..b81795b96 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -10,6 +10,21 @@ #ifndef _NPY_UMATH_FAST_LOOP_MACROS_H_ #define _NPY_UMATH_FAST_LOOP_MACROS_H_ +/* + * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. + * Very large step size can be as slow as processing it using scalar. The + * value of 2097152 ( = 2MB) was chosen using 2 considerations: + * 1) Typical linux kernel page size is 4Kb, but sometimes it could also be 2MB + * which is == 2097152 Bytes. 
For a step size as large as this, surely all + * the loads/stores of gather/scatter instructions falls on 16 different pages + * which one would think would slow down gather/scatter instructions. + * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 which + * allows us to use i32 version of gather/scatter (as opposed to the i64 version) + * without problems (step larger than NPY_MAX_INT32*esize/16 would require use of + * i64gather/scatter). esize = element size = 4/8 bytes for float/double. + */ +#define MAX_STEP_SIZE 2097152 + static NPY_INLINE npy_uintp abs_ptrdiff(char *a, char *b) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index ba538d2ab..570b3ec04 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1658,39 +1658,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void /**end repeat**/ /**begin repeat - * #func = exp, log# - * #scalarf = npy_expf, npy_logf# - */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP { - const npy_float in1 = *(npy_float *)ip1; - *(npy_float *)op1 = @scalarf@(in1); - } -} - -/**end repeat**/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_exp(in1); - } -} -NPY_NO_EXPORT NPY_GCC_OPT_3 void -DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_log(in1); - } -} - -/**begin repeat * #isa = avx512f, fma# * #ISA = AVX512F, FMA# * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS# @@ -1720,59 +1687,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void /**end repeat2**/ /**end repeat1**/ - -/**begin repeat1 - * #func = exp, log# - * #scalarf = npy_expf, npy_logf# - */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_@isa@_@func@_FLOAT(args, dimensions, steps)) { - UNARY_LOOP { - /* - * We use the AVX function to compute exp/log for scalar elements as well. - * This is needed to ensure the output of strided and non-strided - * cases match. SIMD code handles strided input cases, but not - * strided output. 
- */ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]); -#else - const npy_float in1 = *(npy_float *)ip1; - *(npy_float *)op1 = @scalarf@(in1); -#endif - } - } -} - -/**end repeat1**/ - /**end repeat**/ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_avx512f_exp_DOUBLE(args, dimensions, steps)) { - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_exp(in1); - } - } -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_avx512f_log_DOUBLE(args, dimensions, steps)) { - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_log(in1); - } - } -} - /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -2045,41 +1961,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_TWO_OUT { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2); - } -} - -NPY_NO_EXPORT void -@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) -{ - if (!run_unary_two_out_avx512_skx_frexp_@TYPE@(args, dimensions, steps)) { - @TYPE@_frexp(args, dimensions, steps, func); - } -} - -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const int in2 = *(int *)ip2; - *((@type@ *)op1) = npy_ldexp@c@(in1, in2); - } -} - -NPY_NO_EXPORT void -@TYPE@_ldexp_avx512_skx(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (!run_binary_avx512_skx_ldexp_@TYPE@(args, dimensions, steps)) { - @TYPE@_ldexp(args, dimensions, steps, func); - } -} - -NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { /* @@ -2179,6 +2060,25 @@ LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps } } +NPY_NO_EXPORT void +LONGDOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_TWO_OUT { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + *((npy_longdouble *)op1) = npy_frexpl(in1, (int *)op2); + } +} + +NPY_NO_EXPORT void +LONGDOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + const int in2 = *(int *)ip2; + *((npy_longdouble *)op1) = npy_ldexpl(in1, in2); + } +} + /* ***************************************************************************** ** HALF-FLOAT LOOPS ** diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index d73c9fa7f..b3a19be12 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -213,18 +213,6 @@ NPY_NO_EXPORT void /**end repeat1**/ /**end repeat**/ -NPY_NO_EXPORT void -DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DOUBLE_log(char **args, npy_intp const *dimensions, 
npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_trigonometric.dispatch.h" #endif @@ -236,19 +224,18 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( )) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif /**begin repeat - * #func = exp, log# + * #TYPE = FLOAT, DOUBLE# */ -NPY_NO_EXPORT void -FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - /**begin repeat1 - * #isa = avx512f, fma# + * # kind = exp, log, frexp, ldexp# */ - -NPY_NO_EXPORT void -FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) /**end repeat1**/ /**end repeat**/ @@ -373,15 +360,9 @@ NPY_NO_EXPORT void @TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@TYPE@_ldexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src new file mode 100644 index 000000000..1dc24b226 --- /dev/null +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -0,0 +1,1293 @@ +/*@targets + ** $maxopt baseline + ** (avx2 fma3) avx512f avx512_skx + **/ + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include <float.h> + +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" +#include "npy_simd_data.h" + +// TODO: tweak & replace raw SIMD with NPYV + +/******************************************************************************** + ** bunch of helper functions used in ISA_exp/log_FLOAT + ********************************************************************************/ +#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F) + /** + * For some reason, MSVC's aggressive optimization raises a spurious + * 'RuntimeWarning: overflow encountered in exp'. + * + * The issue seems to be caused by '_mm512_maskz_loadu_ps'; we need to + * investigate it while moving to NPYV.
+ */ + #define SIMD_AVX512F +#elif defined(NPY_HAVE_AVX2) && defined(NPY_HAVE_FMA3) + #define SIMD_AVX2_FMA3 +#endif +#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512_SKX) + #define SIMD_AVX512_SKX +#endif +#if defined(SIMD_AVX512F) && !(defined(__clang__) && (__clang_major__ < 10 || \ + (__clang_major__ == 10 && __clang_minor__ < 1))) + #define SIMD_AVX512F_NOCLANG_BUG +#endif + +#ifdef SIMD_AVX2_FMA3 + +static NPY_INLINE __m256 +fma_get_full_load_mask_ps(void) +{ + return _mm256_set1_ps(-1.0); +} + +static NPY_INLINE __m256i +fma_get_full_load_mask_pd(void) +{ + return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); +} + +static NPY_INLINE __m256 +fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes) +{ + float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, + 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float* addr = maskint + num_lanes - num_elem; + return _mm256_loadu_ps(addr); +} + +static NPY_INLINE __m256i +fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes) +{ + npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1}; + npy_int* addr = maskint + 2*num_lanes - 2*num_elem; + return _mm256_loadu_si256((__m256i*) addr); +} + +static NPY_INLINE __m256 +fma_masked_gather_ps(__m256 src, + npy_float* addr, + __m256i vindex, + __m256 mask) +{ + return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); +} + +static NPY_INLINE __m256d +fma_masked_gather_pd(__m256d src, + npy_double* addr, + __m128i vindex, + __m256d mask) +{ + return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); +} + +static NPY_INLINE __m256 +fma_masked_load_ps(__m256 mask, npy_float* addr) +{ + return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); +} + +static NPY_INLINE __m256d +fma_masked_load_pd(__m256i mask, npy_double* addr) +{ + return _mm256_maskload_pd(addr, mask); +} + +static NPY_INLINE __m256 +fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) +{ + return _mm256_blendv_ps(x, val, mask); +} + +static NPY_INLINE __m256d +fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) +{ + return _mm256_blendv_pd(x, val, mask); +} + +static NPY_INLINE __m256 +fma_blend(__m256 x, __m256 y, __m256 ymask) +{ + return _mm256_blendv_ps(x, y, ymask); +} + +static NPY_INLINE __m256 +fma_invert_mask_ps(__m256 ymask) +{ + return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); +} + +static NPY_INLINE __m256i +fma_invert_mask_pd(__m256i ymask) +{ + return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); +} + +static NPY_INLINE __m256 +fma_get_exponent(__m256 x) +{ + /* + * Special handling of denormals: + * 1) Multiply denormal elements with 2**100 (0x71800000) + * 2) Get the 8 bits of unbiased exponent + * 3) Subtract 100 from exponent of denormals + */ + + __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); + __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); + __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); + + /* + * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads + * to an overflow warning in some cases. 
See https://github.com/numpy/numpy/issues/18005 + */ + volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); + __m256 temp = _mm256_mul_ps(temp1, two_power_100); + x = _mm256_blendv_ps(x, temp, denormal_mask); + + __m256 exp = _mm256_cvtepi32_ps( + _mm256_sub_epi32( + _mm256_srli_epi32( + _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E))); + + __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); + return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); +} + +static NPY_INLINE __m256 +fma_get_mantissa(__m256 x) +{ + /* + * Special handling of denormals: + * 1) Multiply denormal elements with 2**100 (0x71800000) + * 2) Get the 23 bits of mantissa + * 3) Mantissa for denormals is not affected by the multiplication + */ + + __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); + __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); + __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); + + /* + * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads + * to an overflow warning in some cases. See https://github.com/numpy/numpy/issues/18005 + */ + volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); + __m256 temp = _mm256_mul_ps(temp1, two_power_100); + x = _mm256_blendv_ps(x, temp, denormal_mask); + + __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); + __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); + return _mm256_castsi256_ps( + _mm256_or_si256( + _mm256_and_si256( + _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); +} + +static NPY_INLINE __m256 +fma_scalef_ps(__m256 poly, __m256 quadrant) +{ + /* + * Handle denormals (which occur when quadrant <= -125): + * 1) This function computes poly*(2^quad) by adding the exponent of + poly to quad + * 2) When quad <= -125, the output is a denormal and the above logic + breaks down + * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125) + * 4) poly*(2^-125) is computed the usual way + * 5) 2^(quad-125) can be computed by: 2 << abs(quad-125) + * 6) The final div operation generates the denormal + */ + __m256 minquadrant = _mm256_set1_ps(-125.0f); + __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ); + if (_mm256_movemask_ps(denormal_mask) != 0x0000) { + __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant); + quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff); + quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask); + __m256i two_power_diff = _mm256_sllv_epi32( + _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff)); + quadrant = _mm256_max_ps(quadrant, minquadrant); //keep quadrant >= -126 + __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); + poly = _mm256_castsi256_ps( + _mm256_add_epi32( + _mm256_castps_si256(poly), exponent)); + __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff)); + return _mm256_blendv_ps(poly, denorm_poly, denormal_mask); + } + else { + __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); + poly = _mm256_castsi256_ps( + _mm256_add_epi32( + _mm256_castps_si256(poly), exponent)); + return poly; + } +} + +#endif // SIMD_AVX2_FMA3 + +#ifdef SIMD_AVX512F + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_get_full_load_mask_ps(void) +{ + return 0xFFFF; +} + +static NPY_INLINE __mmask8 +avx512_get_full_load_mask_pd(void) +{ + return 0xFF; +} + +static NPY_INLINE __mmask16 
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +{ + return (0x0001 << num_elem) - 0x0001; +} + +static NPY_INLINE __mmask8 +avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) +{ + return (0x01 << num_elem) - 0x01; +} + +static NPY_INLINE __m512 +avx512_masked_gather_ps(__m512 src, + npy_float* addr, + __m512i vindex, + __mmask16 kmask) +{ + return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); +} + +static NPY_INLINE __m512d +avx512_masked_gather_pd(__m512d src, + npy_double* addr, + __m256i vindex, + __mmask8 kmask) +{ + return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); +} + +static NPY_INLINE __m512 +avx512_masked_load_ps(__mmask16 mask, npy_float* addr) +{ + return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); +} + +static NPY_INLINE __m512d +avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +{ + return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); +} + +static NPY_INLINE __m512 +avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) +{ + return _mm512_mask_blend_ps(mask, x, val); +} + +static NPY_INLINE __m512d +avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) +{ + return _mm512_mask_blend_pd(mask, x, val); +} + +static NPY_INLINE __m512 +avx512_blend(__m512 x, __m512 y, __mmask16 ymask) +{ + return _mm512_mask_mov_ps(x, ymask, y); +} + +static NPY_INLINE __mmask16 +avx512_invert_mask_ps(__mmask16 ymask) +{ + return _mm512_knot(ymask); +} + +static NPY_INLINE __mmask8 +avx512_invert_mask_pd(__mmask8 ymask) +{ + return _mm512_knot(ymask); +} + +static NPY_INLINE __m512 +avx512_get_exponent(__m512 x) +{ + return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f)); +} + +static NPY_INLINE __m512 +avx512_get_mantissa(__m512 x) +{ + return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); +} + +static NPY_INLINE __m512 +avx512_scalef_ps(__m512 poly, __m512 quadrant) +{ + return _mm512_scalef_ps(poly, quadrant); +} + +static NPY_INLINE __m512d +avx512_permute_x4var_pd(__m512d t0, + __m512d t1, + __m512d t2, + __m512d t3, + __m512i index) +{ + __mmask8 lut_mask = _mm512_cmp_epi64_mask( + _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index), + _mm512_set1_epi64(0), _MM_CMPINT_GT); + __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); + __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); + return _mm512_mask_blend_pd(lut_mask, res1, res2); +} + +static NPY_INLINE __m512d +avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, + __m512d t4, __m512d t5, __m512d t6, __m512d t7, + __m512i index) +{ + __mmask8 lut_mask = _mm512_cmp_epi64_mask( + _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index), + _mm512_set1_epi64(0), _MM_CMPINT_GT); + __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index); + __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index); + return _mm512_mask_blend_pd(lut_mask, res1, res2); +} + +#endif // SIMD_AVX512F + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/**begin repeat + * #ISA = FMA, AVX512F# + * #isa = fma, avx512# + * #vtype = __m256, __m512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #NUM_LANES = 8, 16# + * #mask = __m256, __mmask16# + * #vsub = , _mask# + * #or_masks =_mm256_or_ps, _mm512_kor# + * #and_masks =_mm256_and_ps, _mm512_kand# + * #xor_masks =_mm256_xor_ps, _mm512_kxor# + * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps# + * #mask_to_int = 
_mm256_movemask_ps, # + * #full_mask= 0xFF, 0xFFFF# + * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #CHK = SIMD_AVX2_FMA3, SIMD_AVX512F# + */ +#ifdef @CHK@ +/* + * Vectorized Cody-Waite range reduction technique + * Performs the reduction step x* = x - y*C in three steps: + * 1) x* = x - y*c1 + * 2) x* = x - y*c2 + * 3) x* = x - y*c3 + * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision + */ +static NPY_INLINE @vtype@ +simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) +{ + @vtype@ reduced_x = @fmadd@(y, c1, x); + reduced_x = @fmadd@(y, c2, reduced_x); + reduced_x = @fmadd@(y, c3, reduced_x); + return reduced_x; +} +/* + * Vectorized implementation of exp using AVX2 and AVX512: + * 1) if x >= xmax; return INF (overflow) + * 2) if x <= xmin; return 0.0f (underflow) + * 3) Range reduction (using Cody-Waite): + * a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)] + * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q + * b) P = 5th order and Q = 2nd order polynomials obtained from Remez's + * algorithm (mini-max polynomial approximation) + * 5) Compute exp(x) = exp(y) * 2^k + * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37) + * 7) Max relative error measured across all 32-bit FP's = 2.1264E-07 (for the + * same x = 0xc2781e37) + */ +static void +simd_exp_FLOAT(npy_float * op, + npy_float * ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/(npy_intp)sizeof(npy_float); + const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); + npy_float xmax = 88.72283935546875f; + npy_float xmin = -103.97208404541015625f; + + /* + * Note: while generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via + * IS_OUTPUT_BLOCKABLE_UNARY + */ + npy_int32 indexarr[16]; + for (npy_int32 ii = 0; ii < 16; ii++) { + indexarr[ii] = ii*stride; + } + + /* Load up frequently used constants */ + @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); + @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf); + @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf); + @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf); + @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf); + @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf); + @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf); + @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf); + @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf); + @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf); + @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf); + @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf); + @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef); + @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); + @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); + @vtype@ poly, num_poly, denom_poly, quadrant; + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); + + @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask; + @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); + npy_intp num_remaining_elements = array_size; + + while (num_remaining_elements > 0) { + + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); + } + + @vtype@ x; + if (stride == 1) { + x = @isa@_masked_load_ps(load_mask, ip); + } + else { + x =
@isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); + } + + nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); + x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask); + + xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); + xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); + inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ); + overflow_mask = @or_masks@(overflow_mask, + @xor_masks@(xmax_mask, inf_mask)); + + x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@( + @or_masks@(nan_mask, xmin_mask), xmax_mask)); + + quadrant = _mm@vsize@_mul_ps(x, log2e); + + /* round to nearest */ + quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic); + quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic); + + /* Cody-Waite's range reduction algorithm */ + x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f); + + num_poly = @fmadd@(exp_p5, x, exp_p4); + num_poly = @fmadd@(num_poly, x, exp_p3); + num_poly = @fmadd@(num_poly, x, exp_p2); + num_poly = @fmadd@(num_poly, x, exp_p1); + num_poly = @fmadd@(num_poly, x, exp_p0); + denom_poly = @fmadd@(exp_q2, x, exp_q1); + denom_poly = @fmadd@(denom_poly, x, exp_q0); + poly = _mm@vsize@_div_ps(num_poly, denom_poly); + + /* + * compute val = poly * 2^quadrant; which is same as adding the + * exponent of quadrant to the exponent of poly. quadrant is an int, + * so extracting exponent is simply extracting 8 bits. + */ + poly = @isa@_scalef_ps(poly, quadrant); + + /* + * elem > xmax; return inf + * elem < xmin; return 0.0f + * elem = +/- nan, return nan + */ + poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); + poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask); + poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask); + + @masked_store@(op, @cvtps_epi32@(load_mask), poly); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } + + if (@mask_to_int@(overflow_mask)) { + npy_set_floatstatus_overflow(); + } +} + +/* + * Vectorized implementation of log using AVX2 and AVX512 + * 1) if x < 0.0f; return -NAN (invalid input) + * 2) Range reduction: y = x/2^k; + * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) + * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q + * b) P = 5th order and Q = 5th order polynomials obtained from Remez's + * algorithm (mini-max polynomial approximation) + * 5) Compute log(x) = log(y) + k*ln(2) + * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) + * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same + * x = 0x3f486945) + */ + +static void +simd_log_FLOAT(npy_float * op, + npy_float * ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/(npy_intp)sizeof(npy_float); + const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); + + /* + * Note: while generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via + * IS_OUTPUT_BLOCKABLE_UNARY + */ + npy_int32 indexarr[16]; + for (npy_int32 ii = 0; ii < 16; ii++) { + indexarr[ii] = ii*stride; + } + + /* Load up frequently used constants */ + @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); + @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf); + @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf); + @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf); + @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf); + @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf); + 
@vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf); + @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf); + @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf); + @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf); + @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf); + @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf); + @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f); + @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF); + @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF); + @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF); + @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); + @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); + @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr); + @vtype@ poly, num_poly, denom_poly, exponent; + + @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; + @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); + @mask@ divide_by_zero_mask = invalid_mask; + @mask@ load_mask = @isa@_get_full_load_mask_ps(); + npy_intp num_remaining_elements = array_size; + + while (num_remaining_elements > 0) { + + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); + } + + @vtype@ x_in; + if (stride == 1) { + x_in = @isa@_masked_load_ps(load_mask, ip); + } + else { + x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); + } + + negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); + zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); + inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ); + nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ); + divide_by_zero_mask = @or_masks@(divide_by_zero_mask, + @and_masks@(zero_mask, load_mask)); + invalid_mask = @or_masks@(invalid_mask, negx_mask); + + @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask); + + /* set x = normalized mantissa */ + exponent = @isa@_get_exponent(x); + x = @isa@_get_mantissa(x); + + /* if x < sqrt(2) {exp = exp-1; x = 2*x} */ + sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ); + x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask); + exponent = @isa@_blend(exponent, + _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask); + + /* x = x - 1 */ + x = _mm@vsize@_sub_ps(x, ones_f); + + /* Polynomial approximation for log(1+x) */ + num_poly = @fmadd@(log_p5, x, log_p4); + num_poly = @fmadd@(num_poly, x, log_p3); + num_poly = @fmadd@(num_poly, x, log_p2); + num_poly = @fmadd@(num_poly, x, log_p1); + num_poly = @fmadd@(num_poly, x, log_p0); + denom_poly = @fmadd@(log_q5, x, log_q4); + denom_poly = @fmadd@(denom_poly, x, log_q3); + denom_poly = @fmadd@(denom_poly, x, log_q2); + denom_poly = @fmadd@(denom_poly, x, log_q1); + denom_poly = @fmadd@(denom_poly, x, log_q0); + poly = _mm@vsize@_div_ps(num_poly, denom_poly); + poly = @fmadd@(exponent, loge2, poly); + + /* + * x < 0.0f; return -NAN + * x = +/- NAN; return NAN + * x = 0.0f; return -INF + */ + poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask); + poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask); + poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask); + poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask); + + @masked_store@(op, @cvtps_epi32@(load_mask), poly); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } + + if (@mask_to_int@(invalid_mask)) { + npy_set_floatstatus_invalid(); + } + if (@mask_to_int@(divide_by_zero_mask)) { + 
npy_set_floatstatus_divbyzero(); + } +} +#endif // @CHK@ +/**end repeat**/ + +#ifdef SIMD_AVX512F_NOCLANG_BUG +/* + * Vectorized implementation of exp double using AVX512 + * Reference: Tang, P.T.P., "Table-driven implementation of the + * exponential function in IEEE floating-point + * arithmetic," ACM Transactions on Mathematical + * Software, vol. 15, pp. 144-157, 1989. + * 1) if x > mTH_max or x is INF; return INF (overflow) + * 2) if x < mTH_min; return 0.0f (underflow) + * 3) if abs(x) < mTH_nearzero; return 1.0f + x + * 4) if x is Nan; return Nan + * 5) Range reduction: + * x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64] + * 6) exp(r) - 1 is approximated by a polynomial function p(r) + * exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r)); + */ +static void +AVX512F_exp_DOUBLE(npy_double * op, + npy_double * ip, + const npy_intp array_size, + const npy_intp steps) +{ + npy_intp num_remaining_elements = array_size; + const npy_intp stride = steps / (npy_intp)sizeof(npy_double); + const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); + npy_int32 indexarr[8]; + for (npy_int32 ii = 0; ii < 8; ii++) { + indexarr[ii] = ii*stride; + } + + __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32); + __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC); + __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1); + __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2); + __m512i mMod = _mm512_set1_epi64(0x1f); + __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1); + __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2); + __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3); + __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4); + __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5); + __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54); + __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9); + __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9); + __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY); + __m512d zeros_d = _mm512_set1_pd(0.0f); + __m512d ones_d = _mm512_set1_pd(1.0f); + __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); + + __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0])); + __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1])); + __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2])); + __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3])); + __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0])); + __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1])); + __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2])); + __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3])); + + __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); + __mmask8 load_mask = avx512_get_full_load_mask_pd(); + __mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask; + + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, + num_lanes); + } + + __m512d x; + if (1 == stride) { + x = avx512_masked_load_pd(load_mask, ip); + } + else { + x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); + } + + nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); + x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask); + xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ); + xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ); + inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ); + __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x), + _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF)); + nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs), + mTH_nearzero, _CMP_LT_OQ); + nearzero_mask = 
_mm512_kxor(nearzero_mask, nan_mask); + overflow_mask = _mm512_kor(overflow_mask, + _mm512_kxor(xmax_mask, inf_mask)); + x = avx512_set_masked_lanes_pd(x, zeros_d, + _mm512_kor(_mm512_kor(nan_mask, xmin_mask), + _mm512_kor(xmax_mask, nearzero_mask))); + + /* z = x * 32/ln2 */ + __m512d z = _mm512_mul_pd(x, InvLn2N); + + /* round to nearest */ + __m512d kd = _mm512_add_pd(z, mShift); + __m512i ki = _mm512_castpd_si512(kd); + kd = _mm512_sub_pd(kd, mShift); + + /* r = (x + kd*mNegL1) + kd*mNegL2 */ + __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x); + __m512d r2 = _mm512_mul_pd(kd, mNegL2); + __m512d r = _mm512_add_pd(r1,r2); + + /* Polynomial approximation for exp(r) - 1 */ + __m512d q = _mm512_fmadd_pd(mA5, r, mA4); + q = _mm512_fmadd_pd(q, r, mA3); + q = _mm512_fmadd_pd(q, r, mA2); + q = _mm512_fmadd_pd(q, r, mA1); + q = _mm512_mul_pd(q, r); + __m512d p = _mm512_fmadd_pd(r, q, r2);; + p = _mm512_add_pd(r1, p); + + /* Get 2^(j/32) from lookup table */ + __m512i j = _mm512_and_epi64(ki, mMod); + __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1, + mTable_top_2, mTable_top_3, j); + __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1, + mTable_tail_2, mTable_tail_3, j); + + /* + * s = top + tail; + * exp(x) = 2^m * (top + (tail + s * p)); + */ + __m512d s = _mm512_add_pd(top, tail); + __m512d res = _mm512_fmadd_pd(s, p, tail); + res = _mm512_add_pd(res, top); + res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32))); + + /* return special cases */ + res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d), + nearzero_mask); + res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN), + nan_mask); + res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask); + res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask); + + _mm512_mask_storeu_pd(op, load_mask, res); + + ip += num_lanes * stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } + if (overflow_mask) { + npy_set_floatstatus_overflow(); + } +} +/* + * Vectorized implementation of log double using AVX512 + * Reference: + * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions + * and their error analysis. No. CONF-9106103-1. Argonne National Lab., + * IL (USA), 1991. + * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm + * function in IEEE floating-point arithmetic." ACM Transactions on + * Mathematical Software (TOMS) 16.4 (1990): 378-400. + * [3] Muller, Jean-Michel. "Elementary functions: algorithms and + * implementation." (2016). + * 1) if x = 0; return -INF + * 2) if x < 0; return NAN + * 3) if x is INF; return INF + * 4) if x is NAN; return NAN + * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log() + * 6) Range reduction: + * log(x) = log(2^m * z) + * = mln2 + log(z) + * 7) log(z) = log(z / c_k) + log(c_k); + * where c_k = 1 + k/64, k = 0,1,...,64 + * s.t. |x - c_k| <= 1/128 when x on[1,2]. + * 8) r = 2(x - c_k)/(x + c_k) + * log(x/c_k) = log((1 + r/2) / (1 - r/2)) + * = p(r) + * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) 
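 + *    A worked instance of steps 6)-8) (an illustrative addition, using the
 + *    notation above): x = 3.0 = 2^1 * 1.5 gives m = 1 and z = 1.5; the
 + *    nearest table point is c_32 = 1 + 32/64 = 1.5, so
 + *    r = 2(z - c_32)/(z + c_32) = 0 and
 + *    log(3) = 1*ln2 + log(c_32) + p(0) = ln2 + log(1.5)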
+ */ +static void +AVX512F_log_DOUBLE(npy_double * op, + npy_double * ip, + const npy_intp array_size, + const npy_intp steps) +{ + npy_intp num_remaining_elements = array_size; + const npy_intp stride = steps / (npy_intp)sizeof(npy_double); + const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); + npy_int32 indexarr[8]; + for (npy_int32 ii = 0; ii < 8; ii++) { + indexarr[ii] = ii*stride; + } + + __m512d zeros_d = _mm512_set1_pd(0.0f); + __m512d ones_d = _mm512_set1_pd(1.0f); + __m512d mInf = _mm512_set1_pd(NPY_INFINITY); + __m512d mInv64 = _mm512_castsi512_pd(_mm512_set1_epi64(0x3f90000000000000)); + __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN); + __m512d mNan = _mm512_set1_pd(NPY_NAN); + __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY); + __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1); + __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2); + __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3); + __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4); + __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI); + __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO); + + __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4); + __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4); + __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); + + /* Load lookup table data */ + /**begin repeat + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + + __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@])); + __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@])); + + /**end repeat**/ + + __mmask8 load_mask = avx512_get_full_load_mask_pd(); + __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes); + __mmask8 divide_by_zero_mask = invalid_mask; + + __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask, + glibc_mask; + + __m512d x_in; + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, + num_lanes); + } + + if (1 == stride) { + x_in = avx512_masked_load_pd(load_mask, ip); + } + else { + x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); + } + + /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */ + __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ); + __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ); + glibc_mask = m1 & m2; + + if (glibc_mask != 0xFF) { + zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ); + inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ); + negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ); + nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ); + + divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask); + invalid_mask = invalid_mask | negx_mask; + + __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask); + __m512i ix = _mm512_castpd_si512(x); + + /* Normalize x when it is denormal */ + __m512i top12 = _mm512_and_epi64(ix, + _mm512_set1_epi64(0xfff0000000000000)); + denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), + _CMP_EQ_OQ); + denormal_mask = (~zero_mask) & denormal_mask; + ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask, + x, _mm512_set1_pd(0x1p52))); + ix = _mm512_mask_sub_epi64(ix, denormal_mask, + ix, _mm512_set1_epi64(52ULL << 52)); + + /* + * x = 2^k * z; where z in range [1,2] + */ + __m512i tmp = _mm512_sub_epi64(ix, + _mm512_set1_epi64(0x3ff0000000000000)); + __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6), + _mm512_set1_epi64(0x3fULL)); + __m512i ik = _mm512_srai_epi64(tmp, 52); + __m512d z = 
_mm512_castsi512_pd(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp, + _mm512_set1_epi64(0xfff0000000000000)))); + /* c = i/64 + 1 */ + __m256i i_32 = _mm512_cvtepi64_epi32(i); + __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d); + + /* u = 2 * (z - c) / (z + c) */ + __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c)); + u = _mm512_mul_pd(_mm512_set1_pd(2.0), u); + + /* v = u * u */ + __m512d v = _mm512_mul_pd(u,u); + + /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */ + __m512d res = _mm512_fmadd_pd(v, mA4, mA3); + res = _mm512_fmadd_pd(v, res, mA2); + res = _mm512_fmadd_pd(v, res, mA1); + res = _mm512_mul_pd(v, res); + res = _mm512_fmadd_pd(u, res, u); + + /* Load lookup table data */ + __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1, + mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5, + mLUT_TOP_6, mLUT_TOP_7, i); + __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1, + mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5, + mLUT_TAIL_6, mLUT_TAIL_7, i); + + /* + * log(x) = k * ln2_hi + c_hi + + * k * ln2_lo + c_lo + + * log(z/c) + */ + __m256i ik_32 = _mm512_cvtepi64_epi32(ik); + __m512d k = _mm512_cvtepi32_pd(ik_32); + __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi); + __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo); + tt = _mm512_add_pd(tt, tt2); + res = _mm512_add_pd(tt, res); + + /* return special cases */ + res = avx512_set_masked_lanes_pd(res, mNan, nan_mask); + res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask); + res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask); + res = avx512_set_masked_lanes_pd(res, mInf, inf_mask); + + _mm512_mask_storeu_pd(op, load_mask, res); + } + + /* call glibc's log func when x around 1.0f */ + if (glibc_mask != 0) { + double NPY_DECL_ALIGNED(64) ip_fback[8]; + _mm512_store_pd(ip_fback, x_in); + + for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) { + if (glibc_mask & 0x01) { + op[ii] = npy_log(ip_fback[ii]); + } + } + } + ip += num_lanes * stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } + + if (invalid_mask) { + npy_set_floatstatus_invalid(); + } + if (divide_by_zero_mask) { + npy_set_floatstatus_divbyzero(); + } +} +#endif // AVX512F_NOCLANG_BUG + +#ifdef SIMD_AVX512_SKX +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #num_lanes = 16, 8# + * #vsuffix = ps, pd# + * #mask = __mmask16, __mmask8# + * #vtype1 = __m512, __m512d# + * #vtype2 = __m512i, __m256i# + * #scale = 4, 8# + * #vindextype = __m512i, __m256i# + * #vindexsize = 512, 256# + * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# + * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32# + * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32# + * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32# + * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32# + * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256# + */ +static NPY_INLINE void +AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); + const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int); + const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@); + const npy_intp array_size = dimensions[0]; + npy_intp num_remaining_elements = array_size; + @type@* ip1 = (@type@*) args[0]; + int* ip2 = (int*) args[1]; + @type@* op = (@type@*) args[2]; + + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + + /* + * Note: while 
generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via + * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP + */ + + npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@]; + for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { + index_ip1[ii] = ii*stride_ip1; + index_ip2[ii] = ii*stride_ip2; + index_op[ii] = ii*stride_op; + } + @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); + @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]); + @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]); + @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); + @vtype2@ zeros = @setzero@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype1@ x1; + @vtype2@ x2; + if (stride_ip1 == 1) { + x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); + } + else { + x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); + } + if (stride_ip2 == 1) { + x2 = @vtype2_load@(load_mask, ip2); + } + else { + x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4); + } + + @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2)); + + if (stride_op == 1) { + _mm512_mask_storeu_@vsuffix@(op, load_mask, out); + } + else { + /* scatter! */ + _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@); + } + + ip1 += @num_lanes@*stride_ip1; + ip2 += @num_lanes@*stride_ip2; + op += @num_lanes@*stride_op; + num_remaining_elements -= @num_lanes@; + } +} + +static NPY_INLINE void +AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); + const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@); + const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int); + const npy_intp array_size = dimensions[0]; + npy_intp num_remaining_elements = array_size; + @type@* ip1 = (@type@*) args[0]; + @type@* op1 = (@type@*) args[1]; + int* op2 = (int*) args[2]; + + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + + /* + * Note: while generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via + * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP + */ + + npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@]; + for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { + index_ip1[ii] = ii*stride_ip1; + index_op1[ii] = ii*stride_op1; + index_op2[ii] = ii*stride_op2; + } + @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); + @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]); + @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]); + @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype1@ x1; + if (stride_ip1 == 1) { + x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); + } + else { + x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); + } + + /* + * The x86 instructions vpgetmant and vpgetexp do not conform + * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0 + * We mask these values with spmask to avoid invalid exceptions. 
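 + * (Assuming Intel's vfpclass imm8 bit encoding, 0b10011111 selects
 + * QNaN | +0 | -0 | +INF | -INF | SNaN, so the knot below keeps only the
 + * ordinary finite lanes, denormals included.)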
+ */ + @mask@ spmask = _mm512_knot(_mm512_fpclass_@vsuffix@_mask( + x1, 0b10011111)); + @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@( + spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); + out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1); + @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32( + _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0), + _mm512_maskz_getexp_@vsuffix@(spmask, x1))); + if (stride_op1 == 1) { + _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1); + } + else { + _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@); + } + if (stride_op2 == 1) { + @vtype2_store@(op2, load_mask, out2); + } + else { + @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4); + } + + ip1 += @num_lanes@*stride_ip1; + op1 += @num_lanes@*stride_op1; + op2 += @num_lanes@*stride_op2; + num_remaining_elements -= @num_lanes@; + } +} +/**end repeat**/ +#endif // SIMD_AVX512_SKX + + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #func = exp, log# + * #scalarf = npy_expf, npy_logf# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F) + // the third arg in `IS_OUTPUT_BLOCKABLE_UNARY` is a dummy + // TODO: get rid of this macro during the move to NPYV + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) { + simd_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]); + } + else { + UNARY_LOOP { + /* + * We use the AVX function to compute exp/log for scalar elements as well. + * This is needed to ensure the output of strided and non-strided + * cases match. SIMD code handles strided input cases, but not + * strided output.
+ */ + simd_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]); + } + } +#else + UNARY_LOOP { + const npy_float in1 = *(npy_float *)ip1; + *(npy_float *)op1 = @scalarf@(in1); + } +#endif +} +/**end repeat**/ + +/**begin repeat + * #func = exp, log# + * #scalar = npy_exp, npy_log# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#ifdef SIMD_AVX512F_NOCLANG_BUG + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { + AVX512F_@func@_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); + return; + } +#endif + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = @scalar@(in1); + } +} +/**end repeat**/ + +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #c = f, # + * #C = F, # + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifdef SIMD_AVX512_SKX + if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) { + AVX512_SKX_frexp_@TYPE@(args, dimensions, steps); + return; + } +#endif + UNARY_LOOP_TWO_OUT { + const @type@ in1 = *(@type@ *)ip1; + *((@type@ *)op1) = npy_frexp@c@(in1, (int *)op2); + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifdef SIMD_AVX512_SKX + if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) { + AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps); + return; + } +#endif + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const int in2 = *(int *)ip2; + *((@type@ *)op1) = npy_ldexp@c@(in1, in2); + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h index be9288aff..62438d7a3 100644 --- a/numpy/core/src/umath/npy_simd_data.h +++ b/numpy/core/src/umath/npy_simd_data.h @@ -1,6 +1,6 @@ #ifndef __NPY_SIMD_DATA_H_ #define __NPY_SIMD_DATA_H_ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +#if defined NPY_HAVE_AVX512F #if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) /* * Constants used in vector implementation of float64 exp(x) @@ -122,11 +122,11 @@ static npy_uint64 EXP_Table_tail[32] = { /* * Lookup table of log(c_k) - * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the - * logarithm function in IEEE floating-point arithmetic." ACM Transactions + * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the + * logarithm function in IEEE floating-point arithmetic." ACM Transactions * on Mathematical Software (TOMS) 16.4 (1990): 378-400. */ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +#if defined NPY_HAVE_AVX512F #if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) static npy_uint64 LOG_TABLE_TOP[64] = { 0x0000000000000000, diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index e66763986..1a345b1fb 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -37,21 +37,6 @@ #define VECTOR_SIZE_BYTES 16 /* - * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. - * Very large step size can be as slow as processing it using scalar. 
The - * value of 2097152 ( = 2MB) was chosen using 2 considerations: - * 1) Typical linux kernel page size is 4Kb, but sometimes it could also be 2MB - * which is == 2097152 Bytes. For a step size as large as this, surely all - * the loads/stores of gather/scatter instructions falls on 16 different pages - * which one would think would slow down gather/scatter instructions. - * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 which - * allows us to use i32 version of gather/scatter (as opposed to the i64 version) - * without problems (step larger than NPY_MAX_INT32*esize/16 would require use of - * i64gather/scatter). esize = element size = 4/8 bytes for float/double. - */ -#define MAX_STEP_SIZE 2097152 - -/* * Dispatcher functions * decide whether the operation can be vectorized and run it * if it was run returns true and false if nothing was done @@ -134,42 +119,6 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in /**end repeat1**/ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps); - -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps); -#endif - -static NPY_INLINE int -run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) { - AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; -} - -static NPY_INLINE int -run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) { - AVX512_SKX_frexp_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; -} /**end repeat**/ /**begin repeat @@ -245,74 +194,8 @@ run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp /**end repeat2**/ /**end repeat1**/ - -/**begin repeat1 - * #func = exp, log# - */ - -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE void -@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride); -#endif - -static NPY_INLINE int -run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) { - @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat1**/ - /**end repeat**/ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE void -AVX512F_exp_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride); -#endif -static NPY_INLINE int -run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -#if !(defined(__clang__) && (__clang_major__ < 10 || 
(__clang_major__ == 10 && __clang_minor__ < 1))) - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { - AVX512F_exp_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif -#endif - return 0; -} - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE void -AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride); -#endif -static NPY_INLINE int -run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { - AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif -#endif - return 0; -} - /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -956,106 +839,6 @@ fma_invert_mask_pd(__m256i ymask) return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); } -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_exponent(__m256 x) -{ - /* - * Special handling of denormals: - * 1) Multiply denormal elements with 2**100 (0x71800000) - * 2) Get the 8 bits of unbiased exponent - * 3) Subtract 100 from exponent of denormals - */ - - __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); - __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); - __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); - - /* - * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads - * to an overflow warning in some cases. See https://github.com/numpy/numpy/issues/18005 - */ - volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); - __m256 temp = _mm256_mul_ps(temp1, two_power_100); - x = _mm256_blendv_ps(x, temp, denormal_mask); - - __m256 exp = _mm256_cvtepi32_ps( - _mm256_sub_epi32( - _mm256_srli_epi32( - _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E))); - - __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); - return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_mantissa(__m256 x) -{ - /* - * Special handling of denormals: - * 1) Multiply denormal elements with 2**100 (0x71800000) - * 2) Get the 23 bits of mantissa - * 3) Mantissa for denormals is not affected by the multiplication - */ - - __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); - __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); - __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ); - - /* - * It is necessary for temp1 to be volatile, a bug in clang optimizes it out which leads - * to an overflow warning in some cases. 
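In scalar terms, the three denormal-handling steps above translate roughly to the following (a sketch assuming IEEE-754 binary32, not the dispatched kernel itself):

#include <float.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of fma_get_exponent: prescale denormals by 2**100,
 * read the biased exponent field, then undo the prescaling. */
static float get_exponent_sketch(float x)
{
    int adjust = 0;
    if (fabsf(x) < FLT_MIN) {       /* denormal input */
        x *= 0x1p100f;              /* step 1: multiply by 2**100 */
        adjust = -100;              /* step 3: subtract 100 afterwards */
    }
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));
    /* step 2: biased exponent minus 0x7E is the frexp-style exponent */
    return (float)((int)((bits >> 23) & 0xff) - 0x7E + adjust);
}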
See https://github.com/numpy/numpy/issues/18005 - */ - volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask); - __m256 temp = _mm256_mul_ps(temp1, two_power_100); - x = _mm256_blendv_ps(x, temp, denormal_mask); - - __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); - __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); - return _mm256_castsi256_ps( - _mm256_or_si256( - _mm256_and_si256( - _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 -fma_scalef_ps(__m256 poly, __m256 quadrant) -{ - /* - * Handle denormals (which occur when quadrant <= -125): - * 1) This function computes poly*(2^quad) by adding the exponent of - poly to quad - * 2) When quad <= -125, the output is a denormal and the above logic - breaks down - * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125) - * 4) poly*(2^-125) is computed the usual way - * 5) 2^(quad-125) can be computed by: 2 << abs(quad-125) - * 6) The final div operation generates the denormal - */ - __m256 minquadrant = _mm256_set1_ps(-125.0f); - __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ); - if (_mm256_movemask_ps(denormal_mask) != 0x0000) { - __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant); - quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff); - quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask); - __m256i two_power_diff = _mm256_sllv_epi32( - _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff)); - quadrant = _mm256_max_ps(quadrant, minquadrant); //keep quadrant >= -126 - __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); - poly = _mm256_castsi256_ps( - _mm256_add_epi32( - _mm256_castps_si256(poly), exponent)); - __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff)); - return _mm256_blendv_ps(poly, denorm_poly, denormal_mask); - } - else { - __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23); - poly = _mm256_castsi256_ps( - _mm256_add_epi32( - _mm256_castps_si256(poly), exponent)); - return poly; - } -} - /**begin repeat * #vsub = ps, pd# * #vtype = __m256, __m256d# @@ -1183,52 +966,6 @@ avx512_invert_mask_pd(__mmask8 ymask) return _mm512_knot(ymask); } -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_get_exponent(__m512 x) -{ - return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f)); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_get_mantissa(__m512 x) -{ - return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_scalef_ps(__m512 poly, __m512 quadrant) -{ - return _mm512_scalef_ps(poly, quadrant); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_permute_x4var_pd(__m512d t0, - __m512d t1, - __m512d t2, - __m512d t3, - __m512i index) -{ - __mmask8 lut_mask = _mm512_cmp_epi64_mask( - _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index), - _mm512_set1_epi64(0), _MM_CMPINT_GT); - __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); - __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); - return _mm512_mask_blend_pd(lut_mask, res1, res2); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, - __m512d t4, __m512d t5, __m512d t6, __m512d t7, - __m512i index) -{ - __mmask8 lut_mask = _mm512_cmp_epi64_mask( - _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index), - 
_mm512_set1_epi64(0), _MM_CMPINT_GT); - __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index); - __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index); - return _mm512_mask_blend_pd(lut_mask, res1, res2); -} - /**begin repeat * #vsub = ps, pd# * #type= npy_float, npy_double# @@ -1386,34 +1123,6 @@ avx512_csquare_@vsub@(@vtype@ x) #if defined @CHK@ -/* - * Vectorized Cody-Waite range reduction technique - * Performs the reduction step x* = x - y*C in three steps: - * 1) x* = x - y*c1 - * 2) x* = x - y*c2 - * 3) x* = x - y*c3 - * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision - */ - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) -{ - @vtype@ reduced_x = @fmadd@(y, c1, x); - reduced_x = @fmadd@(y, c2, reduced_x); - reduced_x = @fmadd@(y, c3, reduced_x); - return reduced_x; -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@ -@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin) -{ - @mask@ m1 = _mm@vsize@_cmp_ps@vsub@( - x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ); - @mask@ m2 = _mm@vsize@_cmp_ps@vsub@( - x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ); - return _mm@vsize@_@or@(m1,m2); -} - static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ @isa@_sqrt_ps(@vtype@ x) { @@ -1537,155 +1246,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32# * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256# */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); - const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int); - const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@); - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = array_size; - @type@* ip1 = (@type@*) args[0]; - int* ip2 = (int*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP - */ - - npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { - index_ip1[ii] = ii*stride_ip1; - index_ip2[ii] = ii*stride_ip2; - index_op[ii] = ii*stride_op; - } - @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); - @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]); - @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]); - @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); - @vtype2@ zeros = @setzero@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype1@ x1; - @vtype2@ x2; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); - } - if (stride_ip2 == 1) { - x2 = @vtype2_load@(load_mask, ip2); - } - else { - x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4); - } - - @vtype1@ 
out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2)); - - if (stride_op == 1) { - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - } - else { - /* scatter! */ - _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@); - } - - ip1 += @num_lanes@*stride_ip1; - ip2 += @num_lanes@*stride_ip2; - op += @num_lanes@*stride_op; - num_remaining_elements -= @num_lanes@; - } -} - -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); - const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@); - const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int); - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* op1 = (@type@*) args[1]; - int* op2 = (int*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP - */ - - npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { - index_ip1[ii] = ii*stride_ip1; - index_op1[ii] = ii*stride_op1; - index_op2[ii] = ii*stride_op2; - } - @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]); - @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]); - @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]); - @vtype1@ zeros_f = _mm512_setzero_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype1@ x1; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask); - } - - /* - * The x86 instructions vpgetmant and vpgetexp do not conform - * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0 - * We mask these values with spmask to avoid invalid exceptions. 
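In scalar terms, the lane behaviour the spmask path preserves looks roughly like this (a reference sketch of the npy_frexp-style semantics, not the kernel):

#include <math.h>

/* Special-value lanes: NaN, +/-INF and +/-0.0 pass through unchanged
 * with a zero exponent, and no FP exception is raised. */
static double frexp_special_ref(double x, int *e)
{
    if (isnan(x) || isinf(x) || x == 0.0) {
        *e = 0;
        return x;
    }
    return frexp(x, e);
}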
- */ - @mask@ spmask =_mm512_knot(_mm512_fpclass_@vsuffix@_mask( - x1, 0b10011111)); - @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@( - spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); - out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1); - @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32( - _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0), - _mm512_maskz_getexp_@vsuffix@(spmask, x1))); - if (stride_op1 == 1) { - _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1); - } - else { - _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@); - } - if (stride_op2 == 1) { - @vtype2_store@(op2, load_mask, out2); - } - else { - @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4); - } - - ip1 += @num_lanes@*stride_ip1; - op1 += @num_lanes@*stride_op1; - op2 += @num_lanes@*stride_op2; - num_remaining_elements -= @num_lanes@; - } -} -#endif - /**begin repeat1 * #func = maximum, minimum# * #vectorf = max, min# @@ -1908,654 +1468,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void /**end repeat**/ /**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vtype = __m256, __m512# - * #vsize = 256, 512# - * #BYTES = 32, 64# - * #NUM_LANES = 8, 16# - * #mask = __m256, __mmask16# - * #vsub = , _mask# - * #or_masks =_mm256_or_ps, _mm512_kor# - * #and_masks =_mm256_and_ps, _mm512_kand# - * #xor_masks =_mm256_xor_ps, _mm512_kxor# - * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps# - * #mask_to_int = _mm256_movemask_ps, # - * #full_mask= 0xFF, 0xFFFF# - * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - */ -#if defined @CHK@ -/* - * Vectorized implementation of exp using AVX2 and AVX512: - * 1) if x >= xmax; return INF (overflow) - * 2) if x <= xmin; return 0.0f (underflow) - * 3) Range reduction (using Coyd-Waite): - * a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)] - * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q - * b) P = 5th order and Q = 2nd order polynomials obtained from Remez's - * algorithm (mini-max polynomial approximation) - * 5) Compute exp(x) = exp(y) * 2^k - * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37) - * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the - * same x = 0xc2781e37) - */ - -static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_exp_FLOAT(npy_float * op, - npy_float * ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_float); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); - npy_float xmax = 88.72283935546875f; - npy_float xmin = -103.97208404541015625f; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - npy_int32 indexarr[16]; - for (npy_int32 ii = 0; ii < 16; ii++) { - indexarr[ii] = ii*stride; - } - - /* Load up frequently used constants */ - @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); - @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf); - @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf); - @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf); - @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf); - @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf); - @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf); - @vtype@ exp_p5 = 
_mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf); - @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf); - @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf); - @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf); - @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf); - @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef); - @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); - @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); - @vtype@ poly, num_poly, denom_poly, quadrant; - @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); - - @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask; - @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); - @mask@ load_mask = @isa@_get_full_load_mask_ps(); - npy_intp num_remaining_elements = array_size; - - while (num_remaining_elements > 0) { - - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, - num_lanes); - } - - @vtype@ x; - if (stride == 1) { - x = @isa@_masked_load_ps(load_mask, ip); - } - else { - x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); - } - - nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); - x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask); - - xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); - xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); - inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ); - overflow_mask = @or_masks@(overflow_mask, - @xor_masks@(xmax_mask, inf_mask)); - - x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@( - @or_masks@(nan_mask, xmin_mask), xmax_mask)); - - quadrant = _mm@vsize@_mul_ps(x, log2e); - - /* round to nearest */ - quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic); - quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic); - - /* Cody-Waite's range reduction algorithm */ - x = @isa@_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f); - - num_poly = @fmadd@(exp_p5, x, exp_p4); - num_poly = @fmadd@(num_poly, x, exp_p3); - num_poly = @fmadd@(num_poly, x, exp_p2); - num_poly = @fmadd@(num_poly, x, exp_p1); - num_poly = @fmadd@(num_poly, x, exp_p0); - denom_poly = @fmadd@(exp_q2, x, exp_q1); - denom_poly = @fmadd@(denom_poly, x, exp_q0); - poly = _mm@vsize@_div_ps(num_poly, denom_poly); - - /* - * compute val = poly * 2^quadrant; which is same as adding the - * exponent of quadrant to the exponent of poly. quadrant is an int, - * so extracting exponent is simply extracting 8 bits. 
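In scalar form, the exponent-addition trick described here is plain integer arithmetic on the binary32 representation (sketch for the normal-range case; @isa@_scalef_ps additionally special-cases quadrant <= -125, where the result goes denormal):

#include <stdint.h>
#include <string.h>

/* Scalar sketch of poly * 2**quadrant by adding `quadrant` to the
 * biased exponent bits of poly (normal-range results only). */
static float scalef_sketch(float poly, int quadrant)
{
    uint32_t bits;
    memcpy(&bits, &poly, sizeof(bits));
    bits += (uint32_t)quadrant << 23;   /* shift into the exponent field */
    memcpy(&poly, &bits, sizeof(poly));
    return poly;
}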
- */ - poly = @isa@_scalef_ps(poly, quadrant); - - /* - * elem > xmax; return inf - * elem < xmin; return 0.0f - * elem = +/- nan, return nan - */ - poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); - poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask); - poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask); - - @masked_store@(op, @cvtps_epi32@(load_mask), poly); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - - if (@mask_to_int@(overflow_mask)) { - npy_set_floatstatus_overflow(); - } -} - -/* - * Vectorized implementation of log using AVX2 and AVX512 - * 1) if x < 0.0f; return -NAN (invalid input) - * 2) Range reduction: y = x/2^k; - * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) - * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q - * b) P = 5th order and Q = 5th order polynomials obtained from Remez's - * algorithm (mini-max polynomial approximation) - * 5) Compute log(x) = log(y) + k*ln(2) - * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) - * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same - * x = 0x3f486945) - */ - -static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_log_FLOAT(npy_float * op, - npy_float * ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_float); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - npy_int32 indexarr[16]; - for (npy_int32 ii = 0; ii < 16; ii++) { - indexarr[ii] = ii*stride; - } - - /* Load up frequently used constants */ - @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); - @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf); - @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf); - @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf); - @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf); - @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf); - @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf); - @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf); - @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf); - @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf); - @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf); - @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf); - @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f); - @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF); - @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF); - @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF); - @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF); - @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); - @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); - @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr); - @vtype@ poly, num_poly, denom_poly, exponent; - - @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; - @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); - @mask@ divide_by_zero_mask = invalid_mask; - @mask@ load_mask = @isa@_get_full_load_mask_ps(); - npy_intp num_remaining_elements = array_size; - - while (num_remaining_elements > 0) { - - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, - num_lanes); - } - - @vtype@ x_in; - if (stride == 1) { - x_in = @isa@_masked_load_ps(load_mask, ip); - } - else { - x_in = 
@isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); - } - - negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); - zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); - inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ); - nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ); - divide_by_zero_mask = @or_masks@(divide_by_zero_mask, - @and_masks@(zero_mask, load_mask)); - invalid_mask = @or_masks@(invalid_mask, negx_mask); - - @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask); - - /* set x = normalized mantissa */ - exponent = @isa@_get_exponent(x); - x = @isa@_get_mantissa(x); - - /* if x < sqrt(2) {exp = exp-1; x = 2*x} */ - sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ); - x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask); - exponent = @isa@_blend(exponent, - _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask); - - /* x = x - 1 */ - x = _mm@vsize@_sub_ps(x, ones_f); - - /* Polynomial approximation for log(1+x) */ - num_poly = @fmadd@(log_p5, x, log_p4); - num_poly = @fmadd@(num_poly, x, log_p3); - num_poly = @fmadd@(num_poly, x, log_p2); - num_poly = @fmadd@(num_poly, x, log_p1); - num_poly = @fmadd@(num_poly, x, log_p0); - denom_poly = @fmadd@(log_q5, x, log_q4); - denom_poly = @fmadd@(denom_poly, x, log_q3); - denom_poly = @fmadd@(denom_poly, x, log_q2); - denom_poly = @fmadd@(denom_poly, x, log_q1); - denom_poly = @fmadd@(denom_poly, x, log_q0); - poly = _mm@vsize@_div_ps(num_poly, denom_poly); - poly = @fmadd@(exponent, loge2, poly); - - /* - * x < 0.0f; return -NAN - * x = +/- NAN; return NAN - * x = 0.0f; return -INF - */ - poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask); - poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask); - poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask); - poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask); - - @masked_store@(op, @cvtps_epi32@(load_mask), poly); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - - if (@mask_to_int@(invalid_mask)) { - npy_set_floatstatus_invalid(); - } - if (@mask_to_int@(divide_by_zero_mask)) { - npy_set_floatstatus_divbyzero(); - } -} -#endif -/**end repeat**/ - -/* - * Vectorized implementation of exp double using AVX512 - * Reference: Tang, P.T.P., "Table-driven implementation of the - * exponential function in IEEE floating-point - * arithmetic," ACM Transactions on Mathematical - * Software, vol. 15, pp. 144-157, 1989. 
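Tang's scheme, enumerated in the steps below, can be sketched in scalar C; a simplified rendering in which a caller-supplied table T[j] ~= 2^(j/32) stands in for the split EXP_Table_top/EXP_Table_tail pair the kernel uses:

#include <math.h>

/* Simplified sketch of the table-driven reduction: x is written as
 * (32m + j)*ln2/32 + r, so exp(x) = 2^m * 2^(j/32) * exp(r). */
static double tang_exp_sketch(double x, const double T[32])
{
    double kd = nearbyint(x * (32.0 / M_LN2));
    int    k  = (int)kd;
    int    j  = k & 31;                  /* selects 2^(j/32) from T */
    int    m  = k >> 5;                  /* power-of-two scale */
    double r  = x - kd * (M_LN2 / 32.0); /* r in [-ln2/64, ln2/64] */
    double p  = r + 0.5*r*r + r*r*r/6.0; /* low-order poly for exp(r)-1 */
    return ldexp(T[j] * (1.0 + p), m);
}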
- * 1) if x > mTH_max or x is INF; return INF (overflow) - * 2) if x < mTH_min; return 0.0f (underflow) - * 3) if abs(x) < mTH_nearzero; return 1.0f + x - * 4) if x is Nan; return Nan - * 5) Range reduction: - * x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64] - * 6) exp(r) - 1 is approximated by a polynomial function p(r) - * exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r)); - */ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS -#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) -static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void -AVX512F_exp_DOUBLE(npy_double * op, - npy_double * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = array_size; - const npy_intp stride = steps / (npy_intp)sizeof(npy_double); - const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - - __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32); - __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC); - __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1); - __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2); - __m512i mMod = _mm512_set1_epi64(0x1f); - __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1); - __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2); - __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3); - __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4); - __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5); - __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54); - __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9); - __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9); - __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY); - __m512d zeros_d = _mm512_set1_pd(0.0f); - __m512d ones_d = _mm512_set1_pd(1.0f); - __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); - - __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0])); - __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1])); - __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2])); - __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3])); - __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0])); - __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1])); - __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2])); - __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3])); - - __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); - __mmask8 load_mask = avx512_get_full_load_mask_pd(); - __mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask; - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - - __m512d x; - if (1 == stride) { - x = avx512_masked_load_pd(load_mask, ip); - } - else { - x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); - } - - nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); - x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask); - xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ); - xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ); - inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ); - __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x), - _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF)); - nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs), - mTH_nearzero, _CMP_LT_OQ); - nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask); - overflow_mask = _mm512_kor(overflow_mask, - _mm512_kxor(xmax_mask, inf_mask)); - x = avx512_set_masked_lanes_pd(x, zeros_d, - 
_mm512_kor(_mm512_kor(nan_mask, xmin_mask), - _mm512_kor(xmax_mask, nearzero_mask))); - - /* z = x * 32/ln2 */ - __m512d z = _mm512_mul_pd(x, InvLn2N); - - /* round to nearest */ - __m512d kd = _mm512_add_pd(z, mShift); - __m512i ki = _mm512_castpd_si512(kd); - kd = _mm512_sub_pd(kd, mShift); - - /* r = (x + kd*mNegL1) + kd*mNegL2 */ - __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x); - __m512d r2 = _mm512_mul_pd(kd, mNegL2); - __m512d r = _mm512_add_pd(r1,r2); - - /* Polynomial approximation for exp(r) - 1 */ - __m512d q = _mm512_fmadd_pd(mA5, r, mA4); - q = _mm512_fmadd_pd(q, r, mA3); - q = _mm512_fmadd_pd(q, r, mA2); - q = _mm512_fmadd_pd(q, r, mA1); - q = _mm512_mul_pd(q, r); - __m512d p = _mm512_fmadd_pd(r, q, r2);; - p = _mm512_add_pd(r1, p); - - /* Get 2^(j/32) from lookup table */ - __m512i j = _mm512_and_epi64(ki, mMod); - __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1, - mTable_top_2, mTable_top_3, j); - __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1, - mTable_tail_2, mTable_tail_3, j); - - /* - * s = top + tail; - * exp(x) = 2^m * (top + (tail + s * p)); - */ - __m512d s = _mm512_add_pd(top, tail); - __m512d res = _mm512_fmadd_pd(s, p, tail); - res = _mm512_add_pd(res, top); - res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32))); - - /* return special cases */ - res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d), - nearzero_mask); - res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN), - nan_mask); - res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask); - res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask); - - _mm512_mask_storeu_pd(op, load_mask, res); - - ip += num_lanes * stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - if (overflow_mask) { - npy_set_floatstatus_overflow(); - } -} -#endif -#endif - -/* - * Vectorized implementation of log double using AVX512 - * Reference: - * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions - * and their error analysis. No. CONF-9106103-1. Argonne National Lab., - * IL (USA), 1991. - * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm - * function in IEEE floating-point arithmetic." ACM Transactions on - * Mathematical Software (TOMS) 16.4 (1990): 378-400. - * [3] Muller, Jean-Michel. "Elementary functions: algorithms and - * implementation." (2016). - * 1) if x = 0; return -INF - * 2) if x < 0; return NAN - * 3) if x is INF; return INF - * 4) if x is NAN; return NAN - * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log() - * 6) Range reduction: - * log(x) = log(2^m * z) - * = mln2 + log(z) - * 7) log(z) = log(z / c_k) + log(c_k); - * where c_k = 1 + k/64, k = 0,1,...,64 - * s.t. |x - c_k| <= 1/128 when x on[1,2]. - * 8) r = 2(x - c_k)/(x + c_k) - * log(x/c_k) = log((1 + r/2) / (1 - r/2)) - * = p(r) - * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) 
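A simplified scalar rendering of this reduction (steps 6-8) may make the vector code below easier to follow; log(c_k) is looked up from the LOG_TABLE_TOP/LOG_TABLE_TAIL pair in the kernel, approximated here by log(c):

#include <math.h>

/* Simplified sketch: x = 2^m * z with z in [1,2), c_k = 1 + k/64 the
 * nearest table point, r = 2(z - c_k)/(z + c_k), and then
 * log(x) = m*ln2 + log(c_k) + p(r). */
static double tang_log_sketch(double x)
{
    int m;
    double z = 2.0 * frexp(x, &m);       /* z in [1, 2) */
    m -= 1;
    int    k = (int)nearbyint((z - 1.0) * 64.0);
    double c = 1.0 + k / 64.0;           /* |z - c| <= 1/128 */
    double r = 2.0 * (z - c) / (z + c);
    double h = 0.5 * r;
    double p = 2.0 * (h + h*h*h/3.0 + h*h*h*h*h/5.0);
    return m * M_LN2 + log(c) + p;
}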
- */ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS -#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) -static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void -AVX512F_log_DOUBLE(npy_double * op, - npy_double * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = array_size; - const npy_intp stride = steps / (npy_intp)sizeof(npy_double); - const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - - __m512d zeros_d = _mm512_set1_pd(0.0f); - __m512d ones_d = _mm512_set1_pd(1.0f); - __m512d mInf = _mm512_set1_pd(NPY_INFINITY); - __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000)); - __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN); - __m512d mNan = _mm512_set1_pd(NPY_NAN); - __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY); - __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1); - __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2); - __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3); - __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4); - __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI); - __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO); - - __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4); - __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4); - __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); - - /* Load lookup table data */ - /**begin repeat - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - - __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@])); - __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@])); - - /**end repeat**/ - - __mmask8 load_mask = avx512_get_full_load_mask_pd(); - __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes); - __mmask8 divide_by_zero_mask = invalid_mask; - - __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask, - glibc_mask; - - __m512d x_in; - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - - if (1 == stride) { - x_in = avx512_masked_load_pd(load_mask, ip); - } - else { - x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); - } - - /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */ - __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ); - __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ); - glibc_mask = m1 & m2; - - if (glibc_mask != 0xFF) { - zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ); - inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ); - negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ); - nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ); - - divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask); - invalid_mask = invalid_mask | negx_mask; - - __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask); - __m512i ix = (__m512i)x; - - /* Normalize x when it is denormal */ - __m512i top12 = _mm512_and_epi64(ix, - _mm512_set1_epi64(0xfff0000000000000)); - denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), - _CMP_EQ_OQ); - denormal_mask = (~zero_mask) & denormal_mask; - ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask, - x, _mm512_set1_pd(0x1p52)); - ix = _mm512_mask_sub_epi64(ix, denormal_mask, - ix, _mm512_set1_epi64(52ULL << 52)); - - /* - * x = 2^k * z; where z in range [1,2] - */ - __m512i tmp = _mm512_sub_epi64(ix, - _mm512_set1_epi64(0x3ff0000000000000)); - __m512i i 
= _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6), - _mm512_set1_epi64(0x3fULL)); - __m512i ik = _mm512_srai_epi64(tmp, 52); - __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp, - _mm512_set1_epi64(0xfff0000000000000)))); - /* c = i/64 + 1 */ - __m256i i_32 = _mm512_cvtepi64_epi32(i); - __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d); - - /* u = 2 * (z - c) / (z + c) */ - __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c)); - u = _mm512_mul_pd(_mm512_set1_pd(2.0), u); - - /* v = u * u */ - __m512d v = _mm512_mul_pd(u,u); - - /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */ - __m512d res = _mm512_fmadd_pd(v, mA4, mA3); - res = _mm512_fmadd_pd(v, res, mA2); - res = _mm512_fmadd_pd(v, res, mA1); - res = _mm512_mul_pd(v, res); - res = _mm512_fmadd_pd(u, res, u); - - /* Load lookup table data */ - __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1, - mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5, - mLUT_TOP_6, mLUT_TOP_7, i); - __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1, - mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5, - mLUT_TAIL_6, mLUT_TAIL_7, i); - - /* - * log(x) = k * ln2_hi + c_hi + - * k * ln2_lo + c_lo + - * log(z/c) - */ - __m256i ik_32 = _mm512_cvtepi64_epi32(ik); - __m512d k = _mm512_cvtepi32_pd(ik_32); - __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi); - __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo); - tt = _mm512_add_pd(tt, tt2); - res = _mm512_add_pd(tt, res); - - /* return special cases */ - res = avx512_set_masked_lanes_pd(res, mNan, nan_mask); - res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask); - res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask); - res = avx512_set_masked_lanes_pd(res, mInf, inf_mask); - - _mm512_mask_storeu_pd(op, load_mask, res); - } - - /* call glibc's log func when x around 1.0f */ - if (glibc_mask != 0) { - double NPY_DECL_ALIGNED(64) ip_fback[8]; - _mm512_store_pd(ip_fback, x_in); - - for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) { - if (glibc_mask & 0x01) { - op[ii] = npy_log(ip_fback[ii]); - } - } - } - ip += num_lanes * stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - - if (invalid_mask) { - npy_set_floatstatus_invalid(); - } - if (divide_by_zero_mask) { - npy_set_floatstatus_divbyzero(); - } -} -#endif -#endif - -/**begin repeat * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# * #num_lanes = 16, 8# |