Diffstat (limited to 'numpy')
 numpy/core/code_generators/generate_umath.py |   2
 numpy/core/include/numpy/npy_common.h        |  13
 numpy/core/include/numpy/npy_math.h          |  32
 numpy/core/setup_common.py                   |   5
 numpy/core/src/umath/cpuid.c                 |  19
 numpy/core/src/umath/loops.c.src             |  37
 numpy/core/src/umath/loops.h.src             |  16
 numpy/core/src/umath/simd.inc.src            | 401
8 files changed, 524 insertions(+), 1 deletion(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f251ed6e9..27a2ba1e1 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -697,6 +697,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.exp'),
None,
+ TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD(inexact, f='exp', astype={'e':'f'}),
TD(P, f='exp'),
),
@@ -718,6 +719,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log'),
None,
+ TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD(inexact, f='log', astype={'e':'f'}),
TD(P, f='log'),
),
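Editorial aside, not part of the patch: the simd=[('avx2', 'f'), ('avx512f', 'f')] entries make the generator register ISA-specific inner loops for the float32 signature and select one at module-init time via npy_cpu_supports() (added to cpuid.c below). The sketch shows only the rough shape of that selection; the helper name, array slot and loop index are illustrative, and the real generated code additionally guards each branch with the matching HAVE_ATTRIBUTE_TARGET_* define.

typedef void (*inner_loop_t)(char **, npy_intp *, npy_intp *, void *);

static void
pick_exp_loop(inner_loop_t *slot)
{
    *slot = FLOAT_exp;                   /* baseline loop from loops.c.src */
    if (npy_cpu_supports("avx512f")) {   /* runtime CPU + OS support check */
        *slot = FLOAT_exp_avx512f;
    }
    else if (npy_cpu_supports("avx2")) {
        *slot = FLOAT_exp_avx2;
    }
}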
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 64aaaacff..d83080160 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -50,6 +50,12 @@
#define NPY_GCC_TARGET_AVX2
#endif
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
+#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
+#else
+#define NPY_GCC_TARGET_AVX512F
+#endif
+
/*
* mark an argument (starting from 1) that must not be NULL and is not checked
* DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -68,6 +74,13 @@
#define NPY_HAVE_SSE2_INTRINSICS
#endif
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
+#define NPY_HAVE_AVX2_INTRINSICS
+#endif
+
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
+#define NPY_HAVE_AVX512F_INTRINSICS
+#endif
/*
* give a hint to the compiler which branch is more likely or unlikely
* to occur, e.g. rare error cases:
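Editorial aside, not part of the patch: a minimal sketch of how the new macros are meant to be used together. The target attribute lets a single function be compiled with AVX-512F code generation while the rest of the translation unit keeps the baseline flags, and the *_INTRINSICS define guards the immintrin.h usage. The function name is hypothetical.

#if defined NPY_HAVE_AVX512F_INTRINSICS
#include <immintrin.h>

static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
add_one_avx512f(npy_float *dst, const npy_float *src)
{
    /* only call this after npy_cpu_supports("avx512f") has returned true */
    __m512 v = _mm512_loadu_ps(src);
    _mm512_storeu_ps(dst, _mm512_add_ps(v, _mm512_set1_ps(1.0f)));
}
#endif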
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 582390cdc..dfb8ff526 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -113,6 +113,38 @@ NPY_INLINE static float __npy_nzerof(void)
#define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */
#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */
+/*
+ * Constants used in vector implementation of exp(x)
+ */
+#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
+#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
+#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
+#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
+#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
+#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
+#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
+#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
+#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
+#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
+#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
+
+/*
+ * Constants used in vector implementation of log(x)
+ */
+#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
+#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
+#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
+#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
+#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
+#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
+#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
+#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
+#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
+#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
+#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
+
/*
* C99 double math funcs
*/
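Editorial aside, not part of the patch: the exp constants above are easiest to read against a scalar version of the algorithm they feed: rint-based range reduction with the Cody-Waite split of ln(2), a 5th/2nd-order rational minimax approximation, then scaling by 2^k through the exponent bits. The sketch assumes the macros from this header (NPY_LOG2Ef, NPY_INFINITYF and the constants just added); the overflow/underflow thresholds are the ones used by the vectorized loops in simd.inc.src further down.

#include <stdint.h>
#include <string.h>

static float
ref_expf(float x)
{
    if (x != x) {                        /* NaN propagates */
        return x;
    }
    if (x > 88.72283935546875f) {        /* overflow */
        return NPY_INFINITYF;
    }
    if (x < -87.3365478515625f) {        /* underflow */
        return 0.0f;
    }

    /* k = rint(x/ln(2)), via the add/subtract magic-constant trick */
    float k = (x * NPY_LOG2Ef + NPY_RINT_CVT_MAGICf) - NPY_RINT_CVT_MAGICf;

    /* Cody-Waite range reduction: r = x - k*ln(2), with ln(2) split in two */
    float r = k * NPY_CODY_WAITE_LOGE_2_HIGHf + x;
    r = k * NPY_CODY_WAITE_LOGE_2_LOWf + r;

    /* exp(r) ~= P(r)/Q(r), rational minimax approximation */
    float p = ((((NPY_COEFF_P5_EXPf * r + NPY_COEFF_P4_EXPf) * r
                 + NPY_COEFF_P3_EXPf) * r + NPY_COEFF_P2_EXPf) * r
                 + NPY_COEFF_P1_EXPf) * r + NPY_COEFF_P0_EXPf;
    float q = (NPY_COEFF_Q2_EXPf * r + NPY_COEFF_Q1_EXPf) * r + NPY_COEFF_Q0_EXPf;
    float y = p / q;

    /* exp(x) = exp(r) * 2^k: add k to the exponent field of y */
    int32_t bits;
    memcpy(&bits, &y, sizeof(bits));
    bits += (int32_t)k * (1 << 23);
    memcpy(&y, &bits, sizeof(bits));
    return y;
}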
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index f837df112..a9c044da9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -118,6 +118,7 @@ OPTIONAL_HEADERS = [
# sse headers only enabled automatically on amd64/x32 builds
"xmmintrin.h", # SSE
"emmintrin.h", # SSE2
+ "immintrin.h", # AVX
"features.h", # for glibc version linux
"xlocale.h", # see GH#8367
"dlfcn.h", # dladdr
@@ -149,6 +150,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
"stdio.h", "LINK_AVX"),
("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
"stdio.h", "LINK_AVX2"),
+ ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
+ "stdio.h", "LINK_AVX512F"),
("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
]
@@ -165,6 +168,8 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
'attribute_target_avx'),
('__attribute__((target ("avx2")))',
'attribute_target_avx2'),
+ ('__attribute__((target ("avx512f")))',
+ 'attribute_target_avx512f'),
]
# variable attributes tested via "int %s a" % attribute
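Editorial aside, not part of the patch: the LINK_AVX512F entry asks the build to compile and link roughly the probe below; if that succeeds, HAVE_LINK_AVX512F ends up in the generated config header, and the attribute entry does the same for HAVE_ATTRIBUTE_TARGET_AVX512F. This is only the approximate shape of the generated probe, not its literal text.

#include <stdio.h>

int
main(void)
{
    /* can the toolchain assemble and link an AVX-512F instruction? */
    __asm__ volatile("vpaddd %zmm1, %zmm2, %zmm3");
    return 0;
}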
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 6744ceb05..ab97e7afc 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -11,6 +11,7 @@
#define XCR_XFEATURE_ENABLED_MASK 0x0
#define XSTATE_SSE 0x2
#define XSTATE_YMM 0x4
+#define XSTATE_ZMM 0xE0    /* opmask (k0-k7), ZMM0-15 upper halves, ZMM16-31 */
/*
* verify the OS supports avx instructions
@@ -33,6 +34,19 @@ int os_avx_support(void)
#endif
}
+static NPY_INLINE
+int os_avx512_support(void)
+{
+#if HAVE_XGETBV
+ unsigned int eax, edx;
+ unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
+ unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;
+ __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ return (eax & xcr0) == xcr0;
+#else
+ return 0;
+#endif
+}
/*
* Primitive cpu feature detect function
@@ -42,7 +56,10 @@ NPY_NO_EXPORT int
npy_cpu_supports(const char * feature)
{
#ifdef HAVE___BUILTIN_CPU_SUPPORTS
- if (strcmp(feature, "avx2") == 0) {
+ if (strcmp(feature, "avx512f") == 0) {
+ return __builtin_cpu_supports("avx512f") && os_avx512_support();
+ }
+ else if (strcmp(feature, "avx2") == 0) {
return __builtin_cpu_supports("avx2") && os_avx_support();
}
else if (strcmp(feature, "avx") == 0) {
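Editorial aside, not part of the patch: for reference, these are the XCR0 state-component bits the xgetbv check above cares about (Intel SDM vol. 1, XSAVE-managed state); the enum names are illustrative. AVX-512 code is only safe when the OS saves SSE, AVX and all three AVX-512 components, i.e. (xcr0 & 0xE6) == 0xE6.

enum {
    XCR0_X87       = 1u << 0,
    XCR0_SSE       = 1u << 1,  /* XMM registers           (XSTATE_SSE) */
    XCR0_AVX       = 1u << 2,  /* upper halves of YMM0-15 (XSTATE_YMM) */
    XCR0_OPMASK    = 1u << 5,  /* AVX-512 mask registers k0-k7         */
    XCR0_ZMM_HI256 = 1u << 6,  /* upper halves of ZMM0-15              */
    XCR0_HI16_ZMM  = 1u << 7   /* ZMM16-ZMM31                          */
};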
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 290a87a33..024d495cd 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1569,6 +1569,43 @@ NPY_NO_EXPORT void
/**end repeat**/
+/**begin repeat
+ * #func = exp, log#
+ * #scalarf = npy_expf, npy_logf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = @scalarf@(in1);
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #isa = avx512f, avx2#
+ * #ISA = AVX512F, AVX2#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F, HAVE_ATTRIBUTE_TARGET_AVX2#
+ * #ATTR = NPY_GCC_TARGET_AVX512F, NPY_GCC_TARGET_AVX2#
+ */
+
+/**begin repeat1
+ * #func = exp, log#
+ */
+
+#if @CHK@
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+}
+#endif
+
+/**end repeat1**/
+/**end repeat**/
/**begin repeat
* Float types
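Editorial aside, not part of the patch: for readers unfamiliar with the numpy templating, the repeat block above expands to one thin wrapper per (isa, func) pair. Written out, the avx2/exp instance is:

#if HAVE_ATTRIBUTE_TARGET_AVX2
NPY_NO_EXPORT NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 void
FLOAT_exp_avx2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
    /* args[0] is the input buffer, args[1] the output, dimensions[0] the length */
    AVX2_exp_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
}
#endif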
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 9dc1b7016..8dd3170e3 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,6 +178,22 @@ NPY_NO_EXPORT void
/**end repeat**/
/**begin repeat
+ * #func = exp, log#
+ */
+NPY_NO_EXPORT void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, avx2#
+ */
+
+NPY_NO_EXPORT void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
* Float types
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, f, , l#
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 4bb8569be..f5684c30b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -122,6 +122,27 @@ abs_ptrdiff(char *a, char *b)
*/
/**begin repeat
+ * #ISA = AVX2, AVX512F#
+ */
+
+/* prototypes */
+#if defined NPY_HAVE_@ISA@_INTRINSICS
+
+/**begin repeat1
+ * #func = exp, log#
+ */
+
+static void
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_int n);
+
+/**end repeat1**/
+#endif
+
+/**end repeat**/
+
+
+
+/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
@@ -1075,6 +1096,386 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/**end repeat**/
+/* bunch of helper functions used in ISA_exp/log_FLOAT*/
+
+#if HAVE_ATTRIBUTE_TARGET_AVX2
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_fmadd(__m256 a, __m256 b, __m256 c)
+{
+ return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_full_load_mask(void)
+{
+ return _mm256_set1_ps(-1.0);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+{
+ float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
+ 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
+ float* addr = maskint + total_elem - num_elem;
+ return _mm256_loadu_ps(addr);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_masked_load(__m256 mask, npy_float* addr)
+{
+ return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+{
+ return _mm256_blendv_ps(x, val, mask);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_blend(__m256 x, __m256 y, __m256 ymask)
+{
+ return _mm256_blendv_ps(x, y, ymask);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_exponent(__m256 x)
+{
+ /*
+ * Special handling of denormals:
+ * 1) Multiply denormal elements with 2**100 (0x71800000)
+ * 2) Get the 8 bits of unbiased exponent
+ * 3) Subtract 100 from exponent of denormals
+ */
+
+ __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+ __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+ __m256 temp = _mm256_mul_ps(x, two_power_100);
+ x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+ __m256 exp = _mm256_cvtepi32_ps(
+ _mm256_sub_epi32(
+ _mm256_srli_epi32(
+ _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E)));
+
+ __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
+ return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_mantissa(__m256 x)
+{
+ /*
+ * Special handling of denormals:
+ * 1) Multiply denormal elements with 2**100 (0x71800000)
+ * 2) Get the 23 bits of mantissa
+ * 3) Mantissa for denormals is not affected by the multiplication
+ */
+
+ __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+ __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+ __m256 temp = _mm256_mul_ps(x, two_power_100);
+ x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+ __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
+ __m256i exp_126_bits = _mm256_set1_epi32(126 << 23);
+ return _mm256_castsi256_ps(
+ _mm256_or_si256(
+ _mm256_and_si256(
+ _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
+}
+#endif
+
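Editorial aside, not part of the patch: a scalar, one-lane sketch of the denormal trick used by avx2_get_exponent/avx2_get_mantissa above: scale denormals into the normal range with 2**100, read the exponent and mantissa fields out of the bit pattern, then undo the scaling on the exponent. Assumes x > 0, as in the log kernel where negative lanes are zeroed first.

#include <float.h>
#include <stdint.h>
#include <string.h>

static void
ref_exponent_mantissa(float x, float *exponent, float *mantissa)
{
    float scale = 0.0f;
    if (x < FLT_MIN) {          /* denormal: make it normal first */
        x *= 0x1.0p100f;        /* 2**100, bit pattern 0x71800000 */
        scale = 100.0f;
    }

    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));

    /* exponent chosen so that the mantissa below lands in [0.5, 1) */
    *exponent = (float)((int32_t)(bits >> 23) - 0x7E) - scale;

    /* keep the 23 mantissa bits, force the exponent field to 126 */
    bits = (bits & 0x7fffffu) | (126u << 23);
    memcpy(mantissa, &bits, sizeof(bits));
}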
+#if HAVE_ATTRIBUTE_TARGET_AVX512F
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_get_full_load_mask(void)
+{
+ return 0xFFFF;
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+{
+ return (0x0001 << num_elem) - 0x0001;
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_masked_load(__mmask16 mask, npy_float* addr)
+{
+ return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+{
+ return _mm512_mask_blend_ps(mask, x, val);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
+{
+ return _mm512_mask_mov_ps(x, ymask, y);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_get_exponent(__m512 x)
+{
+ return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_get_mantissa(__m512 x)
+{
+ return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+}
+#endif
+
+/**begin repeat
+ * #ISA = AVX2, AVX512F#
+ * #isa = avx2, avx512#
+ * #vtype = __m256, __m512#
+ * #vsize = 256, 512#
+ * #or = or_ps, kor#
+ * #vsub = , _mask#
+ * #mask = __m256, __mmask16#
+ * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ **/
+
+#if HAVE_ATTRIBUTE_TARGET_@ISA@
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_cmp_mask(@vtype@ x, npy_float fnum, int sign)
+{
+ return _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(fnum), sign);
+}
+
+NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
+{
+ @vtype@ reduced_x = @fmadd@(y, c1, x);
+ reduced_x = @fmadd@(y, c2, reduced_x);
+ reduced_x = @fmadd@(y, c3, reduced_x);
+ return reduced_x;
+}
+#endif
+/**end repeat**/
+
+/**begin repeat
+ * #ISA = AVX2, AVX512F#
+ * #isa = avx2, avx512#
+ * #vtype = __m256, __m512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #mask = __m256, __mmask16#
+ * #and_masks =_mm256_and_ps, _mm512_kand#
+ * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #mask_to_int = _mm256_movemask_ps, #
+ * #full_mask= 0xFF, 0xFFFF#
+ * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ */
+
+#if HAVE_ATTRIBUTE_TARGET_@ISA@
+
+/*
+ * Vectorized implementation of exp using AVX2 and AVX512:
+ * 1) if x >= xmax; return INF (overflow)
+ * 2) if x <= xmin; return 0.0f (underflow)
+ * 3) Range reduction (using Cody-Waite):
+ *    a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [-ln(2)/2, ln(2)/2]
+ * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
+ *    a) P = 5th order and Q = 2nd order polynomials obtained from Remez's
+ *       algorithm (mini-max polynomial approximation)
+ * 5) Compute exp(x) = exp(y) * 2^k
+ * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
+ * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
+ * same x = 0xc2781e37)
+ */
+
+NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
+{
+ const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_float xmax = 88.72283935546875f;
+ npy_float xmin = -87.3365478515625f;
+
+ /* Load up frequently used constants */
+ @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
+ @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
+ @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf);
+ @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf);
+ @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf);
+ @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf);
+ @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf);
+ @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf);
+ @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf);
+ @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf);
+ @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf);
+ @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+ @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef);
+ @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
+ @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
+ @vtype@ poly, num_poly, denom_poly, quadrant;
+ @vtype@i exponent;
+
+ @mask@ xmax_mask, xmin_mask;
+ @mask@ load_mask = @isa@_get_full_load_mask();
+ npy_int num_remaining_elements = array_size;
+
+ while (num_remaining_elements > 0) {
+
+ if (num_remaining_elements < num_lanes)
+ load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+ num_lanes);
+ @vtype@ x = @isa@_masked_load(load_mask, ip);
+ xmax_mask = @isa@_cmp_mask(x, xmax, _CMP_GE_OQ);
+ xmin_mask = @isa@_cmp_mask(x, xmin, _CMP_LE_OQ);
+
+ x = @isa@_set_masked_lanes(x, zeros_f,
+ @and_masks@(xmax_mask,xmin_mask));
+
+ quadrant = _mm@vsize@_mul_ps(x, log2e);
+
+ /* round to nearest */
+ quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+ quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+ /* Cody-Waite's range reduction algorithm */
+ x = @isa@_range_reduction(x, quadrant,
+ codyw_c1, codyw_c2, zeros_f);
+
+ num_poly = @fmadd@(exp_p5, x, exp_p4);
+ num_poly = @fmadd@(num_poly, x, exp_p3);
+ num_poly = @fmadd@(num_poly, x, exp_p2);
+ num_poly = @fmadd@(num_poly, x, exp_p1);
+ num_poly = @fmadd@(num_poly, x, exp_p0);
+ denom_poly = @fmadd@(exp_q2, x, exp_q1);
+ denom_poly = @fmadd@(denom_poly, x, exp_q0);
+ poly = _mm@vsize@_div_ps(num_poly, denom_poly);
+
+ /*
+ * compute val = poly * 2^quadrant; which is same as adding the
+ * exponent of quadrant to the exponent of poly. quadrant is an int,
+ * so extracting exponent is simply extracting 8 bits.
+ */
+ exponent = _mm@vsize@_slli_epi32(_mm@vsize@_cvtps_epi32(quadrant), 23);
+ poly = _mm@vsize@_castsi@vsize@_ps(
+ _mm@vsize@_add_epi32(
+ _mm@vsize@_castps_si@vsize@(poly), exponent));
+
+ /* elem > xmax; return inf, elem < xmin; return 0.0f */
+ poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
+ poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
+
+ @masked_store@(op, @cvtps_epi32@(load_mask), poly);
+
+ ip += num_lanes;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+
+/*
+ * Vectorized implementation of log using AVX2 and AVX512:
+ * 1) if x < 0.0f; return -NAN (invalid input)
+ * 2) Range reduction: y = x/2^k;
+ *    a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
+ * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
+ *    a) P = 5th order and Q = 5th order polynomials obtained from Remez's
+ *       algorithm (mini-max polynomial approximation)
+ * 4) Compute log(x) = log(y) + k*ln(2)
+ * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
+ * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for the
+ *    same x = 0x3f486945)
+ */
+
+NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
+{
+ const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+
+ /* Load up frequently used constants */
+ @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
+ @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf);
+ @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf);
+ @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf);
+ @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf);
+ @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf);
+ @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf);
+ @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf);
+ @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf);
+ @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf);
+ @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf);
+ @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf);
+ @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f);
+ @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF);
+ @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF);
+ @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
+ @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+ @vtype@ poly, num_poly, denom_poly, exponent;
+
+ @mask@ inf_nan_mask, sqrt2_mask, zero_mask, negx_mask;
+ @mask@ load_mask = @isa@_get_full_load_mask();
+ npy_int num_remaining_elements = array_size;
+
+ while (num_remaining_elements > 0) {
+
+ if (num_remaining_elements < num_lanes)
+ load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+ num_lanes);
+ @vtype@ x_in = @isa@_masked_load(load_mask, ip);
+
+ negx_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_LT_OQ);
+ zero_mask = @isa@_cmp_mask(x_in, 0.0f, _CMP_EQ_OQ);
+ inf_nan_mask = @isa@_cmp_mask(x_in, FLT_MAX, _CMP_GT_OQ);
+
+ @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
+
+ /* set x = normalized mantissa */
+ exponent = @isa@_get_exponent(x);
+ x = @isa@_get_mantissa(x);
+
+ /* if x < sqrt(2) {exp = exp-1; x = 2*x} */
+ sqrt2_mask = @isa@_cmp_mask(x, NPY_SQRT1_2f, _CMP_LE_OQ);
+ x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask);
+ exponent = @isa@_blend(exponent,
+ _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask);
+
+ /* x = x - 1 */
+ x = _mm@vsize@_sub_ps(x, ones_f);
+
+ /* Polynomial approximation for log(1+x) */
+ num_poly = @fmadd@(log_p5, x, log_p4);
+ num_poly = @fmadd@(num_poly, x, log_p3);
+ num_poly = @fmadd@(num_poly, x, log_p2);
+ num_poly = @fmadd@(num_poly, x, log_p1);
+ num_poly = @fmadd@(num_poly, x, log_p0);
+ denom_poly = @fmadd@(log_q5, x, log_q4);
+ denom_poly = @fmadd@(denom_poly, x, log_q3);
+ denom_poly = @fmadd@(denom_poly, x, log_q2);
+ denom_poly = @fmadd@(denom_poly, x, log_q1);
+ denom_poly = @fmadd@(denom_poly, x, log_q0);
+ poly = _mm@vsize@_div_ps(num_poly, denom_poly);
+ poly = @fmadd@(exponent, loge2, poly);
+
+ /*
+ * x < 0.0f; return -NAN
+ * x = 0.0f; return -INF
+ * x > FLT_MAX; return x
+ */
+ poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask);
+ poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask);
+ poly = @isa@_set_masked_lanes(poly, x_in, inf_nan_mask);
+
+ @masked_store@(op, @cvtps_epi32@(load_mask), poly);
+
+ ip += num_lanes;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+#endif
+/**end repeat**/
+
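Editorial aside, not part of the patch: the same log scheme as a scalar reference, under the assumption that the NPY_COEFF_*_LOGf, NPY_SQRT1_2f, NPY_LOGE2f, NPY_NANF and NPY_INFINITYF macros from npy_math.h are available; frexpf() stands in for the getexp/getmant bit manipulation used by the vector kernels.

#include <float.h>
#include <math.h>

static float
ref_logf(float x)
{
    if (x != x || x > FLT_MAX) {   /* NaN and +inf pass straight through */
        return x;
    }
    if (x == 0.0f) {
        return -NPY_INFINITYF;
    }
    if (x < 0.0f) {
        return -NPY_NANF;          /* invalid input */
    }

    /* range reduction: x = m * 2^k with m in [0.5, 1) */
    int k;
    float m = frexpf(x, &k);

    /* keep m - 1 small: if m <= 1/sqrt(2), double m and drop the exponent */
    if (m <= NPY_SQRT1_2f) {
        k -= 1;
        m += m;
    }
    m -= 1.0f;

    /* log(1 + m) ~= P(m)/Q(m), rational minimax approximation */
    float p = ((((NPY_COEFF_P5_LOGf * m + NPY_COEFF_P4_LOGf) * m
                 + NPY_COEFF_P3_LOGf) * m + NPY_COEFF_P2_LOGf) * m
                 + NPY_COEFF_P1_LOGf) * m + NPY_COEFF_P0_LOGf;
    float q = ((((NPY_COEFF_Q5_LOGf * m + NPY_COEFF_Q4_LOGf) * m
                 + NPY_COEFF_Q3_LOGf) * m + NPY_COEFF_Q2_LOGf) * m
                 + NPY_COEFF_Q1_LOGf) * m + NPY_COEFF_Q0_LOGf;

    /* log(x) = log(m) + k*ln(2) */
    return p / q + (float)k * NPY_LOGE2f;
}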
/*
*****************************************************************************
** BOOL LOOPS