From 9754a207828f377654c79873e38d475bb87d98de Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 5 Mar 2019 09:13:55 -0800 Subject: ENH: vectorizing float32 implementation of np.exp & np.log This commit implements vectorized single-precision exponential and natural log using AVX2 and AVX512. Accuracy: | Function | Max ULP Error | Max Relative Error | |----------|---------------|--------------------| | np.exp | 2.52 | 2.1E-07 | | np.log | 3.83 | 2.4E-07 | Performance: (1) Micro-benchmarks: measured execution time of np.exp and np.log using the timeit package in Python. Each function is executed 1000 times and this is repeated 100 times. The standard deviation for all the runs was less than 2% of their mean value and hence not included in the data. The vectorized implementation was up to 7.6x faster than the scalar version. | Function | NumPy1.16 | AVX2 | AVX512 | AVX2 speedup | AVX512 speedup | | -------- | --------- | ------ | ------ | ------------ | -------------- | | np.exp | 0.395s | 0.112s | 0.055s | 3.56x | 7.25x | | np.log | 0.456s | 0.147s | 0.059s | 3.10x | 7.64x | (2) Logistic regression: exp and log are heavily used in training neural networks (as part of the sigmoid activation function and the loss function, respectively). This patch significantly speeds up training a logistic regression model. As an example, we measured how much time it takes to train a model with 15 features using 1000 training data points. We observed a 2x speedup when training the model to achieve a loss function error < 10E-04. | Function | NumPy1.16 | AVX2 | AVX512 | AVX2 speedup | AVX512 speedup | | -------------- | ---------- | ------ | ------ | ------------ | -------------- | | logistic.train | 121.0s | 75.02s | 60.60s | 1.61x | 2.02x | --- numpy/core/code_generators/generate_umath.py | 2 + numpy/core/include/numpy/npy_common.h | 13 + numpy/core/include/numpy/npy_math.h | 32 +++ numpy/core/setup_common.py | 5 + numpy/core/src/umath/cpuid.c | 19 +- numpy/core/src/umath/loops.c.src | 37 +++ numpy/core/src/umath/loops.h.src | 16 ++ numpy/core/src/umath/simd.inc.src | 401 +++++++++++++++++++++++++++ 8 files changed, 524 insertions(+), 1 deletion(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index f251ed6e9..27a2ba1e1 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -697,6 +697,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.exp'), None, + TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]), TD(inexact, f='exp', astype={'e':'f'}), TD(P, f='exp'), ), @@ -718,6 +719,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log'), None, + TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]), TD(inexact, f='log', astype={'e':'f'}), TD(P, f='log'), ), diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index 64aaaacff..d83080160 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -50,6 +50,12 @@ #define NPY_GCC_TARGET_AVX2 #endif +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F +#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f"))) +#else +#define NPY_GCC_TARGET_AVX512F +#endif + /* * mark an argument (starting from 1) that must not be NULL and is not checked * DO NOT USE IF FUNCTION CHECKS FOR NULL!!
the compiler will remove the check @@ -68,6 +74,13 @@ #define NPY_HAVE_SSE2_INTRINSICS #endif +#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2 +#define NPY_HAVE_AVX2_INTRINSICS +#endif + +#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F +#define NPY_HAVE_AVX512F_INTRINSICS +#endif /* * give a hint to the compiler which branch is more likely or unlikely * to occur, e.g. rare error cases: diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h index 582390cdc..dfb8ff526 100644 --- a/numpy/core/include/numpy/npy_math.h +++ b/numpy/core/include/numpy/npy_math.h @@ -113,6 +113,38 @@ NPY_INLINE static float __npy_nzerof(void) #define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */ #define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */ +/* + * Constants used in vector implementation of exp(x) + */ +#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f +#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f +#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f +#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f +#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f +#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f +#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f +#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f +#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f +#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f +#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f +#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f + +/* + * Constants used in vector implementation of log(x) + */ +#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f +#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f +#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f +#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f +#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f +#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f +#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f +#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f +#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f +#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f +#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f +#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f + /* * C99 double math funcs */ diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index f837df112..a9c044da9 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -118,6 +118,7 @@ OPTIONAL_HEADERS = [ # sse headers only enabled automatically on amd64/x32 builds "xmmintrin.h", # SSE "emmintrin.h", # SSE2 + "immintrin.h", # AVX "features.h", # for glibc version linux "xlocale.h", # see GH#8367 "dlfcn.h", # dladdr @@ -149,6 +150,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), "stdio.h", "LINK_AVX"), ("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"', "stdio.h", "LINK_AVX2"), + ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"', + "stdio.h", "LINK_AVX512F"), ("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"), ] @@ -165,6 +168,8 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))', 'attribute_target_avx'), ('__attribute__((target ("avx2")))', 'attribute_target_avx2'), + ('__attribute__((target ("avx512f")))', + 'attribute_target_avx512f'), ] # variable attributes tested via "int %s a" % attribute diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c index 6744ceb05..ab97e7afc 100644 --- a/numpy/core/src/umath/cpuid.c +++ b/numpy/core/src/umath/cpuid.c @@ -11,6 
+11,7 @@ #define XCR_XFEATURE_ENABLED_MASK 0x0 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 +#define XSTATE_ZMM 0x70 /* * verify the OS supports avx instructions @@ -33,6 +34,19 @@ int os_avx_support(void) #endif } +static NPY_INLINE +int os_avx512_support(void) +{ +#if HAVE_XGETBV + unsigned int eax, edx; + unsigned int ecx = XCR_XFEATURE_ENABLED_MASK; + unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE; + __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); + return (eax & xcr0) == xcr0; +#else + return 0; +#endif +} /* * Primitive cpu feature detect function @@ -42,7 +56,10 @@ NPY_NO_EXPORT int npy_cpu_supports(const char * feature) { #ifdef HAVE___BUILTIN_CPU_SUPPORTS - if (strcmp(feature, "avx2") == 0) { + if (strcmp(feature, "avx512f") == 0) { + return __builtin_cpu_supports("avx512f") && os_avx512_support(); + } + else if (strcmp(feature, "avx2") == 0) { return __builtin_cpu_supports("avx2") && os_avx_support(); } else if (strcmp(feature, "avx") == 0) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 290a87a33..024d495cd 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1569,6 +1569,43 @@ NPY_NO_EXPORT void /**end repeat**/ +/**begin repeat + * #func = exp, log# + * #scalarf = npy_expf, npy_logf# + */ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP { + const npy_float in1 = *(npy_float *)ip1; + *(npy_float *)op1 = @scalarf@(in1); + } +} + +/**end repeat**/ + +/**begin repeat + * #isa = avx512f, avx2# + * #ISA = AVX512F, AVX2# + * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F, HAVE_ATTRIBUTE_TARGET_AVX2# + * #ATTR = NPY_GCC_TARGET_AVX512F, NPY_GCC_TARGET_AVX2# + */ + +/**begin repeat1 + * #func = exp, log# + */ + +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]); +} +#endif + +/**end repeat1**/ +/**end repeat**/ /**begin repeat * Float types diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 9dc1b7016..8dd3170e3 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -177,6 +177,22 @@ NPY_NO_EXPORT void @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat**/ +/**begin repeat + * #func = exp, log# + */ +NPY_NO_EXPORT void +FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #isa = avx512f, avx2# + */ + +NPY_NO_EXPORT void +FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 4bb8569be..f5684c30b 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -121,6 +121,27 @@ abs_ptrdiff(char *a, char *b) ***************************************************************************** */ +/**begin repeat + * #ISA = AVX2, AVX512F# + */ + +/* prototypes */ +#if defined NPY_HAVE_@ISA@_INTRINSICS + +/**begin repeat1 + * #func = exp, log# + */ + +static void +@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_int n); + +/**end repeat1**/ +#endif + +/**end repeat**/ + + + /**begin repeat * Float 
types * #type = npy_float, npy_double, npy_longdouble# @@ -1075,6 +1096,386 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /**end repeat**/ +/* bunch of helper functions used in ISA_exp/log_FLOAT*/ + +#if HAVE_ATTRIBUTE_TARGET_AVX2 +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_fmadd(__m256 a, __m256 b, __m256 c) +{ + return _mm256_add_ps(_mm256_mul_ps(a, b), c); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_get_full_load_mask(void) +{ + return _mm256_set1_ps(-1.0); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) +{ + float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, + 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float* addr = maskint + total_elem - num_elem; + return _mm256_loadu_ps(addr); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_masked_load(__m256 mask, npy_float* addr) +{ + return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask) +{ + return _mm256_blendv_ps(x, val, mask); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_blend(__m256 x, __m256 y, __m256 ymask) +{ + return _mm256_blendv_ps(x, y, ymask); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_get_exponent(__m256 x) +{ + /* + * Special handling of denormals: + * 1) Multiply denormal elements with 2**100 (0x71800000) + * 2) Get the 8 bits of unbiased exponent + * 3) Subtract 100 from exponent of denormals + */ + + __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); + __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); + __m256 temp = _mm256_mul_ps(x, two_power_100); + x = _mm256_blendv_ps(x, temp, denormal_mask); + + __m256 exp = _mm256_cvtepi32_ps( + _mm256_sub_epi32( + _mm256_srli_epi32( + _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E))); + + __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f)); + return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +avx2_get_mantissa(__m256 x) +{ + /* + * Special handling of denormals: + * 1) Multiply denormal elements with 2**100 (0x71800000) + * 2) Get the 23 bits of mantissa + * 3) Mantissa for denormals is not affected by the multiplication + */ + + __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000)); + __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ); + __m256 temp = _mm256_mul_ps(x, two_power_100); + x = _mm256_blendv_ps(x, temp, denormal_mask); + + __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff); + __m256i exp_126_bits = _mm256_set1_epi32(126 << 23); + return _mm256_castsi256_ps( + _mm256_or_si256( + _mm256_and_si256( + _mm256_castps_si256(x), mantissa_bits), exp_126_bits)); +} +#endif + +#if HAVE_ATTRIBUTE_TARGET_AVX512F +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_get_full_load_mask(void) +{ + return 0xFFFF; +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) +{ + return (0x0001 << num_elem) - 0x0001; +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_masked_load(__mmask16 mask, npy_float* addr) +{ + return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask) 
+{ + return _mm512_mask_blend_ps(mask, x, val); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_blend(__m512 x, __m512 y, __mmask16 ymask) +{ + return _mm512_mask_mov_ps(x, ymask, y); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_get_exponent(__m512 x) +{ + return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f)); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +avx512_get_mantissa(__m512 x) +{ + return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); +} +#endif + +/**begin repeat + * #ISA = AVX2, AVX512F# + * #isa = avx2, avx512# + * #vtype = __m256, __m512# + * #vsize = 256, 512# + * #or = or_ps, kor# + * #vsub = , _mask# + * #mask = __m256, __mmask16# + * #fmadd = avx2_fmadd,_mm512_fmadd_ps# + **/ + +#if HAVE_ATTRIBUTE_TARGET_@ISA@ +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@ +@isa@_cmp_mask(@vtype@ x, npy_float fnum, int sign) +{ + return _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(fnum), sign); +} + +NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) +{ + @vtype@ reduced_x = @fmadd@(y, c1, x); + reduced_x = @fmadd@(y, c2, reduced_x); + reduced_x = @fmadd@(y, c3, reduced_x); + return reduced_x; +} +#endif +/**end repeat**/ + +/**begin repeat + * #ISA = AVX2, AVX512F# + * #isa = avx2, avx512# + * #vtype = __m256, __m512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #mask = __m256, __mmask16# + * #and_masks =_mm256_and_ps, _mm512_kand# + * #fmadd = avx2_fmadd,_mm512_fmadd_ps# + * #mask_to_int = _mm256_movemask_ps, # + * #full_mask= 0xFF, 0xFFFF# + * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + */ + +#if HAVE_ATTRIBUTE_TARGET_@ISA@ + +/* + * Vectorized implementation of exp using AVX2 and AVX512: + * 1) if x >= xmax; return INF (overflow) + * 2) if x <= xmin; return 0.0f (underflow) + * 3) Range reduction (using Cody-Waite): + * a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)] + * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q + * b) P = 5th order and Q = 2nd order polynomials obtained from Remez's + * algorithm (mini-max polynomial approximation) + * 5) Compute exp(x) = exp(y) * 2^k + * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37) + * 7) Max relative error measured across all 32-bit FP's = 2.1264E-07 (for the + * same x = 0xc2781e37) + */ + +NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size) +{ + const npy_int num_lanes = @BYTES@/sizeof(npy_float); + npy_float xmax = 88.72283935546875f; + npy_float xmin = -87.3365478515625f; + + /* Load up frequently used constants */ + @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf); + @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf); + @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf); + @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf); + @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf); + @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf); + @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf); + @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf); + @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf); + @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf); + @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf); + @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf); + @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef); + @vtype@ inf =
_mm@vsize@_set1_ps(NPY_INFINITYF); + @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); + @vtype@ poly, num_poly, denom_poly, quadrant; + @vtype@i exponent; + + @mask@ xmax_mask, xmin_mask; + @mask@ load_mask = @isa@_get_full_load_mask(); + npy_int num_remaining_elements = array_size; + + while (num_remaining_elements > 0) { + + if (num_remaining_elements < num_lanes) + load_mask = @isa@_get_partial_load_mask(num_remaining_elements, + num_lanes); + @vtype@ x = @isa@_masked_load(load_mask, ip); + xmax_mask = @isa@_cmp_mask(x, xmax, _CMP_GE_OQ); + xmin_mask = @isa@_cmp_mask(x, xmin, _CMP_LE_OQ); + + x = @isa@_set_masked_lanes(x, zeros_f, + @and_masks@(xmax_mask,xmin_mask)); + + quadrant = _mm@vsize@_mul_ps(x, log2e); + + /* round to nearest */ + quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic); + quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic); + + /* Cody-Waite's range reduction algorithm */ + x = @isa@_range_reduction(x, quadrant, + codyw_c1, codyw_c2, zeros_f); + + num_poly = @fmadd@(exp_p5, x, exp_p4); + num_poly = @fmadd@(num_poly, x, exp_p3); + num_poly = @fmadd@(num_poly, x, exp_p2); + num_poly = @fmadd@(num_poly, x, exp_p1); + num_poly = @fmadd@(num_poly, x, exp_p0); + denom_poly = @fmadd@(exp_q2, x, exp_q1); + denom_poly = @fmadd@(denom_poly, x, exp_q0); + poly = _mm@vsize@_div_ps(num_poly, denom_poly); + + /* + * compute val = poly * 2^quadrant; which is the same as adding the + * exponent of quadrant to the exponent of poly. quadrant is an int, + * so extracting the exponent is simply extracting 8 bits. + */ + exponent = _mm@vsize@_slli_epi32(_mm@vsize@_cvtps_epi32(quadrant), 23); + poly = _mm@vsize@_castsi@vsize@_ps( + _mm@vsize@_add_epi32( + _mm@vsize@_castps_si@vsize@(poly), exponent)); + + /* elem > xmax; return inf, elem < xmin; return 0.0f */ + poly = @isa@_set_masked_lanes(poly, inf, xmax_mask); + poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask); + + @masked_store@(op, @cvtps_epi32@(load_mask), poly); + + ip += num_lanes; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} + +/* + * Vectorized implementation of log using AVX2 and AVX512 + * 1) if x < 0.0f; return -NAN (invalid input) + * 2) Range reduction: y = x/2^k; + * a) y = normalized mantissa, k is the exponent (0.5 <= y < 1) + * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q + * b) P = 5th order and Q = 5th order polynomials obtained from Remez's + * algorithm (mini-max polynomial approximation) + * 4) Compute log(x) = log(y) + k*ln(2) + * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945) + * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for the same + * x = 0x3f486945) + */ + +NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size) +{ + const npy_int num_lanes = @BYTES@/sizeof(npy_float); + + /* Load up frequently used constants */ + @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf); + @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf); + @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf); + @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf); + @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf); + @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf); + @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf); + @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf); + @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf); + @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf); + @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf); + @vtype@ log_q5 =
_mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf); + @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f); + @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF); + @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF); + @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f); + @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); + @vtype@ poly, num_poly, denom_poly, exponent; + + @mask@ inf_nan_mask, sqrt2_mask, zero_mask, negx_mask; + @mask@ load_mask = @isa@_get_full_load_mask(); + npy_int num_remaining_elements = array_size; + + while (num_remaining_elements > 0) { + + if (num_remaining_elements < num_lanes) + load_mask = @isa@_get_partial_load_mask(num_remaining_elements, + num_lanes); + @vtype@ x_in = @isa@_masked_load(load_mask, ip); + + negx_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_LT_OQ); + zero_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_EQ_OQ); + inf_nan_mask = @isa@_cmp_mask(x_in, FLT_MAX, _CMP_GT_OQ); + + @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask); + + /* set x = normalized mantissa */ + exponent = @isa@_get_exponent(x); + x = @isa@_get_mantissa(x); + + /* if x <= 1/sqrt(2) {exponent = exponent-1; x = 2*x} */ + sqrt2_mask = @isa@_cmp_mask(x, NPY_SQRT1_2f, _CMP_LE_OQ); + x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask); + exponent = @isa@_blend(exponent, + _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask); + + /* x = x - 1 */ + x = _mm@vsize@_sub_ps(x, ones_f); + + /* Polynomial approximation for log(1+x) */ + num_poly = @fmadd@(log_p5, x, log_p4); + num_poly = @fmadd@(num_poly, x, log_p3); + num_poly = @fmadd@(num_poly, x, log_p2); + num_poly = @fmadd@(num_poly, x, log_p1); + num_poly = @fmadd@(num_poly, x, log_p0); + denom_poly = @fmadd@(log_q5, x, log_q4); + denom_poly = @fmadd@(denom_poly, x, log_q3); + denom_poly = @fmadd@(denom_poly, x, log_q2); + denom_poly = @fmadd@(denom_poly, x, log_q1); + denom_poly = @fmadd@(denom_poly, x, log_q0); + poly = _mm@vsize@_div_ps(num_poly, denom_poly); + poly = @fmadd@(exponent, loge2, poly); + + /* + * x < 0.0f; return -NAN + * x = 0.0f; return -INF + * x > FLT_MAX; return x + */ + poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask); + poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask); + poly = @isa@_set_masked_lanes(poly, x_in, inf_nan_mask); + + @masked_store@(op, @cvtps_epi32@(load_mask), poly); + + ip += num_lanes; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} +#endif +/**end repeat**/ + /* ***************************************************************************** ** BOOL LOOPS -- cgit v1.2.1 From 1352359095cbf64c6ad3426ed674eb61cf47e258 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 22 Mar 2019 10:38:14 -0700 Subject: BUG: Fixing compile-time error for clang 1) Got rid of the @isa@_cmp_mask helper function, since clang expects the compare operator in _mm@vsize@_cmp_ps to be a compile-time constant 2) Converted all helper functions to static functions --- numpy/core/src/umath/simd.inc.src | 55 ++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index f5684c30b..0fa98a68a 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -1099,19 +1099,19 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /* bunch of helper functions used in ISA_exp/log_FLOAT*/ #if HAVE_ATTRIBUTE_TARGET_AVX2 -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_fmadd(__m256 a, __m256 b, __m256 c) { return
_mm256_add_ps(_mm256_mul_ps(a, b), c); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_get_full_load_mask(void) { return _mm256_set1_ps(-1.0); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) { float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, @@ -1120,25 +1120,25 @@ avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) return _mm256_loadu_ps(addr); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_masked_load(__m256 mask, npy_float* addr) { return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask) { return _mm256_blendv_ps(x, val, mask); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_blend(__m256 x, __m256 y, __m256 ymask) { return _mm256_blendv_ps(x, y, ymask); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_get_exponent(__m256 x) { /* @@ -1162,7 +1162,7 @@ avx2_get_exponent(__m256 x) return _mm256_blendv_ps(exp, denorm_exp, denormal_mask); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_get_mantissa(__m256 x) { /* @@ -1187,43 +1187,43 @@ avx2_get_mantissa(__m256 x) #endif #if HAVE_ATTRIBUTE_TARGET_AVX512F -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_get_full_load_mask(void) { return 0xFFFF; } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) { return (0x0001 << num_elem) - 0x0001; } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_masked_load(__mmask16 mask, npy_float* addr) { return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask) { return _mm512_mask_blend_ps(mask, x, val); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_blend(__m512 x, __m512 y, __mmask16 ymask) { return _mm512_mask_mov_ps(x, ymask, y); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_get_exponent(__m512 x) { return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f)); } -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_get_mantissa(__m512 x) { return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src); @@ -1242,13 +1242,7 @@ avx512_get_mantissa(__m512 x) **/ #if HAVE_ATTRIBUTE_TARGET_@ISA@ -NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@ -@isa@_cmp_mask(@vtype@ x, npy_float fnum, int sign) -{ - return _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(fnum), sign); -} - -NPY_INLINE 
NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) { @vtype@ reduced_x = @fmadd@(y, c1, x); @@ -1266,6 +1260,7 @@ NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * #vsize = 256, 512# * #BYTES = 32, 64# * #mask = __m256, __mmask16# + * #vsub = , _mask# * #and_masks =_mm256_and_ps, _mm512_kand# * #fmadd = avx2_fmadd,_mm512_fmadd_ps# * #mask_to_int = _mm256_movemask_ps, # @@ -1291,7 +1286,7 @@ NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * same x = 0xc2781e37) */ -NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size) { const npy_int num_lanes = @BYTES@/sizeof(npy_float); @@ -1327,8 +1322,8 @@ NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void load_mask = @isa@_get_partial_load_mask(num_remaining_elements, num_lanes); @vtype@ x = @isa@_masked_load(load_mask, ip); - xmax_mask = @isa@_cmp_mask(x, xmax, _CMP_GE_OQ); - xmin_mask = @isa@_cmp_mask(x, xmin, _CMP_LE_OQ); + xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); + xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); x = @isa@_set_masked_lanes(x, zeros_f, @and_masks@(xmax_mask,xmin_mask)); @@ -1388,7 +1383,7 @@ NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void * x = 0x3f486945) */ -NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size) { const npy_int num_lanes = @BYTES@/sizeof(npy_float); @@ -1424,9 +1419,9 @@ NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void num_lanes); @vtype@ x_in = @isa@_masked_load(load_mask, ip); - negx_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_LT_OQ); - zero_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_EQ_OQ); - inf_nan_mask = @isa@_cmp_mask(x_in, FLT_MAX, _CMP_GT_OQ); + negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); + zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ); + inf_nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, _mm@vsize@_set1_ps(FLT_MAX), _CMP_GT_OQ); @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask); @@ -1435,7 +1430,7 @@ NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void x = @isa@_get_mantissa(x); /* if x <= 1/sqrt(2) {exponent = exponent-1; x = 2*x} */ - sqrt2_mask = @isa@_cmp_mask(x, NPY_SQRT1_2f, _CMP_LE_OQ); + sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ); x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask); exponent = @isa@_blend(exponent, _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask); -- cgit v1.2.1 From f9d14627b36bc25aace6c78e6e5f6fe68c08bfcb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 27 Mar 2019 13:55:37 -0700 Subject: BUG: Fixing AVX512F build issues on clang6.0 clang6.0 fails to compile this code: __asm__ __volatile__ ( "vpaddd %zmm1, %zmm2, %zmm3\n\t" ); Note that this is a known issue in clang6.0. clang7.0 and gcc do not have this problem. This fails to set the flag HAVE_LINK_AVX512F. Hence, the AVX512F version of exp and log doesn't get built. If AVX512F is detected at runtime, instead of choosing to run the AVX2 version, it will end up running the scalar version.
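For context, the configuration check that fails is roughly equivalent to compiling a test program like the one below. This is a sketch reconstructed from the LINK_AVX512F entry in setup_common.py's OPTIONAL_INTRINSICS list; the exact harness that numpy.distutils generates around it may differ:

    #include <stdio.h>

    int main(void)
    {
        /* clang6.0 rejects this AVX512F instruction in inline assembly,
         * so the HAVE_LINK_AVX512F macro never gets defined. */
        __asm__ volatile("vpaddd %zmm1, %zmm2, %zmm3");
        return 0;
    }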
--- numpy/core/src/umath/loops.c.src | 23 ++++++++++++++++++----- numpy/core/src/umath/simd.inc.src | 8 ++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 024d495cd..a9526c2ed 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1588,21 +1588,34 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE /**begin repeat * #isa = avx512f, avx2# * #ISA = AVX512F, AVX2# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F, HAVE_ATTRIBUTE_TARGET_AVX2# - * #ATTR = NPY_GCC_TARGET_AVX512F, NPY_GCC_TARGET_AVX2# + * #CHK1 = HAVE_ATTRIBUTE_TARGET_AVX512F, HAVE_ATTRIBUTE_TARGET_AVX2# + * #CHK2 = NPY_HAVE_AVX512F_INTRINSICS, NPY_HAVE_AVX2_INTRINSICS# */ /**begin repeat1 * #func = exp, log# + * #scalarf = npy_expf, npy_logf# */ -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +NPY_NO_EXPORT NPY_GCC_OPT_3 void FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { +#if defined @CHK1@ && defined @CHK2@ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]); -} +#else + /* + * This is the path it would take if the ISA was runtime detected, but + * not compiled for. It fixes the error on clang6.0, which fails to + * compile the AVX512F version. Not sure if I like this idea: if it + * detects AVX512F at runtime, it will end up running the scalar + * version instead of AVX2. + */ + UNARY_LOOP { + const npy_float in1 = *(npy_float *)ip1; + *(npy_float *)op1 = @scalarf@(in1); + } #endif +} /**end repeat1**/ /**end repeat**/ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 0fa98a68a..31f11d302 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -1098,7 +1098,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /* bunch of helper functions used in ISA_exp/log_FLOAT*/ -#if HAVE_ATTRIBUTE_TARGET_AVX2 +#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined NPY_HAVE_AVX2_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_fmadd(__m256 a, __m256 b, __m256 c) { @@ -1186,7 +1186,7 @@ avx2_get_mantissa(__m256 x) } #endif -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined NPY_HAVE_AVX512F_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_get_full_load_mask(void) { @@ -1241,7 +1241,7 @@ avx512_get_mantissa(__m512 x) * #fmadd = avx2_fmadd,_mm512_fmadd_ps# **/ -#if HAVE_ATTRIBUTE_TARGET_@ISA@ +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@ && defined NPY_HAVE_@ISA@_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) { @@ -1269,7 +1269,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * #cvtps_epi32 = _mm256_cvtps_epi32, # */ -#if HAVE_ATTRIBUTE_TARGET_@ISA@ +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@ && defined NPY_HAVE_@ISA@_INTRINSICS /* * Vectorized implementation of exp using AVX2 and AVX512: -- cgit v1.2.1 From 651e03c0019d4c4c6ca8c43cb7d7c0d344a72cc1 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 28 Mar 2019 15:07:35 -0700 Subject: BUG: Adding macro HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS 1) Use __builtin_cpu_supports("avx512f") only for gcc version >= 5 2) Introduced two new macros: HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS to ensure the compiler can compile functions that use intrinsics and are compiled with
avx2/avx512f attributes --- numpy/core/include/numpy/npy_common.h | 4 ++++ numpy/core/setup.py | 5 +++++ numpy/core/setup_common.py | 15 +++++++++++++++ numpy/core/src/umath/cpuid.c | 4 ++++ numpy/core/src/umath/loops.c.src | 5 ++--- numpy/core/src/umath/simd.inc.src | 10 +++++----- numpy/distutils/command/autodist.py | 20 ++++++++++++++++++++ numpy/distutils/command/config.py | 6 ++++++ 8 files changed, 61 insertions(+), 8 deletions(-) diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index d83080160..108c0a202 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -46,12 +46,16 @@ #endif #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2"))) +#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS +#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2"))) #else #define NPY_GCC_TARGET_AVX2 #endif #if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F #define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f"))) +#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS +#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f"))) #else #define NPY_GCC_TARGET_AVX512F #endif diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2bcd17f27..9f1ecf358 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -171,6 +171,11 @@ def check_math_capabilities(config, moredefs, mathlibs): if config.check_gcc_function_attribute(dec, fn): moredefs.append((fname2def(fn), 1)) + for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS: + if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code, + header): + moredefs.append((fname2def(fn), 1)) + for fn in OPTIONAL_VARIABLE_ATTRIBUTES: if config.check_gcc_variable_attribute(fn): m = fn.replace("(", "_").replace(")", "_") diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index a9c044da9..885aec443 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -172,6 +172,21 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))', 'attribute_target_avx512f'), ] +# function attributes with intrinsics +# To ensure your compiler can compile avx intrinsics with just the attributes +# gcc 4.8.4 supports attributes but not with intrinsics +# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code) +# function name will be converted to HAVE_ preprocessor macro +OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))', + 'attribute_target_avx2_with_intrinsics', + '__m256 temp = _mm256_set1_ps(1.0)', + 'immintrin.h'), + ('__attribute__((target ("avx512f")))', + 'attribute_target_avx512f_with_intrinsics', + '__m512 temp = _mm512_set1_ps(1.0)', + 'immintrin.h'), + ] + # variable attributes tested via "int %s a" % attribute OPTIONAL_VARIABLE_ATTRIBUTES = ["__thread", "__declspec(thread)"] diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c index ab97e7afc..51c540457 100644 --- a/numpy/core/src/umath/cpuid.c +++ b/numpy/core/src/umath/cpuid.c @@ -57,7 +57,11 @@ npy_cpu_supports(const char * feature) { #ifdef HAVE___BUILTIN_CPU_SUPPORTS if (strcmp(feature, "avx512f") == 0) { +#if defined(__GNUC__) && (__GNUC__ < 5) + return 0; +#else return __builtin_cpu_supports("avx512f") && os_avx512_support(); +#endif } else if (strcmp(feature, "avx2") == 0) { return __builtin_cpu_supports("avx2") && os_avx_support(); diff --git
a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index a9526c2ed..8a9cc58ab 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1588,8 +1588,7 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE /**begin repeat * #isa = avx512f, avx2# * #ISA = AVX512F, AVX2# - * #CHK1 = HAVE_ATTRIBUTE_TARGET_AVX512F, HAVE_ATTRIBUTE_TARGET_AVX2# - * #CHK2 = NPY_HAVE_AVX512F_INTRINSICS, NPY_HAVE_AVX2_INTRINSICS# + * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS# */ /**begin repeat1 @@ -1600,7 +1599,7 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE NPY_NO_EXPORT NPY_GCC_OPT_3 void FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { -#if defined @CHK1@ && defined @CHK2@ +#if defined @CHK@ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]); #else /* diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 31f11d302..9e491b407 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -126,7 +126,7 @@ abs_ptrdiff(char *a, char *b) */ /* prototypes */ -#if defined NPY_HAVE_@ISA@_INTRINSICS +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS /**begin repeat1 * #func = exp, log# @@ -1098,7 +1098,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /* bunch of helper functions used in ISA_exp/log_FLOAT*/ -#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined NPY_HAVE_AVX2_INTRINSICS +#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256 avx2_fmadd(__m256 a, __m256 b, __m256 c) { @@ -1186,7 +1186,7 @@ avx2_get_mantissa(__m256 x) } #endif -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined NPY_HAVE_AVX512F_INTRINSICS +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_get_full_load_mask(void) { @@ -1241,7 +1241,7 @@ avx512_get_mantissa(__m512 x) * #fmadd = avx2_fmadd,_mm512_fmadd_ps# **/ -#if defined HAVE_ATTRIBUTE_TARGET_@ISA@ && defined NPY_HAVE_@ISA@_INTRINSICS +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3) { @@ -1269,7 +1269,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * #cvtps_epi32 = _mm256_cvtps_epi32, # */ -#if defined HAVE_ATTRIBUTE_TARGET_@ISA@ && defined NPY_HAVE_@ISA@_INTRINSICS /* * Vectorized implementation of exp using AVX2 and AVX512: @@ -1286,6 +1285,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * same x = 0xc2781e37) */ +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size) { diff --git a/numpy/distutils/command/autodist.py b/numpy/distutils/command/autodist.py index d5e78963c..03fe0c64c 100644 --- a/numpy/distutils/command/autodist.py +++ b/numpy/distutils/command/autodist.py @@ -78,6 +78,26 @@ main() """ % (attribute, name) return cmd.try_compile(body, None, None) != 0 +def check_gcc_function_attribute_with_intrinsics(cmd, attribute, name, code, + include): + """Return True if the given function attribute is supported with + intrinsics.""" + cmd._check_compiler() + body = """ +#include<%s> +int %s %s(void) +{ + %s; + return 0; +} + 
+int +main() { + return 0; } +""" % (include, attribute, name, code) + return cmd.try_compile(body, None, None) != 0 def check_gcc_variable_attribute(cmd, attribute): """Return True if the given variable attribute is supported.""" cmd._check_compiler() diff --git a/numpy/distutils/command/config.py b/numpy/distutils/command/config.py index d9b1e8488..74d6900dc 100644 --- a/numpy/distutils/command/config.py +++ b/numpy/distutils/command/config.py @@ -18,6 +18,7 @@ import distutils from numpy.distutils.exec_command import filepath_from_subprocess_output from numpy.distutils.mingw32ccompiler import generate_manifest from numpy.distutils.command.autodist import (check_gcc_function_attribute, + check_gcc_function_attribute_with_intrinsics, check_gcc_variable_attribute, check_inline, check_restrict, @@ -424,6 +425,11 @@ int main (void) def check_gcc_function_attribute(self, attribute, name): return check_gcc_function_attribute(self, attribute, name) + def check_gcc_function_attribute_with_intrinsics(self, attribute, name, + code, include): + return check_gcc_function_attribute_with_intrinsics(self, attribute, + name, code, include) + def check_gcc_variable_attribute(self, attribute): return check_gcc_variable_attribute(self, attribute) -- cgit v1.2.1 From e1417a31e8d6c67faab160761d80602eb40154b4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 29 Mar 2019 13:24:33 -0700 Subject: BUG: Fixing incomplete guards for @ISA@_exp/log_FLOAT functions Added a missing NPY_HAVE_SSE2_INTRINSICS guard --- numpy/core/src/umath/loops.c.src | 2 +- numpy/core/src/umath/simd.inc.src | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 8a9cc58ab..ff3b36428 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1599,7 +1599,7 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE NPY_NO_EXPORT NPY_GCC_OPT_3 void FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { -#if defined @CHK@ +#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]); #else /* diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 9e491b407..9a96ec3f4 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -126,7 +126,7 @@ abs_ptrdiff(char *a, char *b) */ /* prototypes */ -#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS +#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS /**begin repeat1 * #func = exp, log# -- cgit v1.2.1 From d3125fa94893af10597f3e0b07613c719940a543 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 19 Apr 2019 10:48:23 -0700 Subject: DOC: adding release notes for 1.17.0 --- doc/release/1.17.0-notes.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst index fa6a132dd..87033f020 100644 --- a/doc/release/1.17.0-notes.rst +++ b/doc/release/1.17.0-notes.rst @@ -191,6 +191,12 @@ but with this change, you can do:: thereby saving a level of indentation +`numpy.exp and numpy.log` speed up for float32 implementation +------------------------------------------------------------- +The float32 implementations of numpy.exp and numpy.log now benefit from the +AVX2/AVX512 instruction sets, which are detected at runtime.
numpy.exp has a max ulp +error of 2.52 and numpy.log has a max ulp error of 3.83. + Improve performance of ``np.pad`` --------------------------------- The performance of the function has been improved for most cases by filling in -- cgit v1.2.1
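Addendum: a minimal sketch of the timeit micro-benchmark methodology described in the first commit message of this series. The array size and dtype below are assumptions made for illustration; the commit message does not state the exact inputs used for the reported timings.

    import timeit

    # Assumed input: a large float32 array (the commit message does not
    # give the array size used for the reported numbers).
    setup = "import numpy as np; x = np.random.rand(100000).astype(np.float32)"

    for stmt in ("np.exp(x)", "np.log(x)"):
        # Each function is executed 1000 times per measurement and the
        # measurement is repeated 100 times, matching the methodology
        # described in the commit message. min(runs) is the total time
        # in seconds for 1000 calls in the fastest repeat.
        runs = timeit.repeat(stmt, setup=setup, number=1000, repeat=100)
        print(stmt, "best of 100 repeats:", min(runs))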