summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/code_generators/generate_umath.py1
-rw-r--r--numpy/core/include/numpy/npy_math.h47
-rw-r--r--numpy/core/src/umath/loops.c.src20
-rw-r--r--numpy/core/src/umath/loops.h.src6
-rw-r--r--numpy/core/src/umath/npy_simd_data.h137
-rw-r--r--numpy/core/src/umath/simd.inc.src187
-rw-r--r--numpy/core/tests/data/umath-validation-set-exp277
-rw-r--r--numpy/core/tests/test_umath.py12
-rw-r--r--numpy/core/tests/test_umath_accuracy.py17
9 files changed, 651 insertions, 53 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index c14711d16..52ae3cdd7 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -702,6 +702,7 @@ defdict = {
None,
TD('e', f='exp', astype={'e':'f'}),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('d', simd=[('avx512f', 'd')]),
TD('fdg' + cmplx, f='exp'),
TD(P, f='exp'),
),
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 69e690f28..a07f49501 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -114,53 +114,6 @@ NPY_INLINE static float __npy_nzerof(void)
#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */
/*
- * Constants used in vector implementation of exp(x)
- */
-#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
-#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
-#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
-#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
-#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
-#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
-#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
-#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
-#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
-#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
-#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
-#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
-
-/*
- * Constants used in vector implementation of log(x)
- */
-#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
-#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
-#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
-#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
-#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
-#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
-#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
-#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
-#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
-#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
-#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
-#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-/*
- * Constants used in vector implementation of sinf/cosf(x)
- */
-#define NPY_TWO_O_PIf 0x1.45f306p-1f
-#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
-#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
-#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
-#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
-#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
-#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
-#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
-#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
-#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
-#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
-#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
-#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
-/*
* Integer functions.
*/
NPY_INPLACE npy_uint npy_gcdu(npy_uint a, npy_uint b);
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 9b43824cb..eea82309c 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1558,6 +1558,15 @@ FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, voi
/**end repeat**/
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{ /* Generic float64 exp inner loop: one npy_exp() call per element, no SIMD. */
+ UNARY_LOOP { /* macro defined elsewhere; it supplies the ip1/op1 element pointers used below */
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_exp(in1);
+ }
+}
+
/**begin repeat
* #isa = avx512f, fma#
* #ISA = AVX512F, FMA#
@@ -1688,6 +1697,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *step
/**end repeat1**/
/**end repeat**/
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{ /* float64 exp inner loop that tries the AVX512F kernel first and falls back to scalar npy_exp(). */
+ if (!run_unary_avx512f_exp_DOUBLE(args, dimensions, steps)) { /* 0 means the vector kernel did not run */
+ UNARY_LOOP { /* same scalar loop body as DOUBLE_exp */
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_exp(in1);
+ }
+ }
+}
+
/**begin repeat
* Float types
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e9d0b4c62..50a7ccfee 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -196,6 +196,12 @@ NPY_NO_EXPORT void
/**end repeat1**/
/**end repeat**/
+NPY_NO_EXPORT void
+DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /* scalar float64 exp loop; the unused last parameter is spelled `data` in the definition */
+
+NPY_NO_EXPORT void
+DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /* float64 exp loop that attempts the AVX512F kernel before the scalar fallback */
+
/**begin repeat
* #func = sin, cos, exp, log#
*/
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
new file mode 100644
index 000000000..36c8b6c03
--- /dev/null
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -0,0 +1,137 @@
+#ifndef __NPY_SIMD_DATA_H_
+#define __NPY_SIMD_DATA_H_
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
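+/*
+ * Constants used in vector implementation of float64 exp(x)
+ * (table-driven range reduction with 32 table entries):
+ *   NPY_RINT_CVT_MAGIC  1.5 * 2^52, added and subtracted to round to nearest
+ *   NPY_INV_LN2_MUL_32  32 / ln(2)
+ *   NPY_TANG_NEG_L1/L2  high and low parts of -ln(2) / 32
+ *   NPY_TANG_A1..A5     coefficients of the polynomial approximating exp(r) - 1
+ */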
+#define NPY_RINT_CVT_MAGIC 0x1.8p52
+#define NPY_INV_LN2_MUL_32 0x1.71547652b82fep+5
+#define NPY_TANG_NEG_L1 -0x1.62e42fefp-6
+#define NPY_TANG_NEG_L2 -0x1.473de6af278edp-39
+#define NPY_TANG_A1 0x1p-1
+#define NPY_TANG_A2 0x1.5555555548f7cp-3
+#define NPY_TANG_A3 0x1.5555555545d4ep-5
+#define NPY_TANG_A4 0x1.11115b7aa905ep-7
+#define NPY_TANG_A5 0x1.6c1728d739765p-10
+
+/*
+ * Lookup table for 2^(j/32), j = 0..31: leading part of each value.
+ * Entries are raw IEEE-754 double bit patterns (entry 0 is 1.0) whose
+ * lowest mantissa bits are zero; the exp kernel adds the matching
+ * EXP_Table_tail entry to recover the remaining bits.
+ */
+static npy_uint64 EXP_Table_top[32] = {
+ 0x3FF0000000000000,
+ 0x3FF059B0D3158540,
+ 0x3FF0B5586CF98900,
+ 0x3FF11301D0125B40,
+ 0x3FF172B83C7D5140,
+ 0x3FF1D4873168B980,
+ 0x3FF2387A6E756200,
+ 0x3FF29E9DF51FDEC0,
+ 0x3FF306FE0A31B700,
+ 0x3FF371A7373AA9C0,
+ 0x3FF3DEA64C123400,
+ 0x3FF44E0860618900,
+ 0x3FF4BFDAD5362A00,
+ 0x3FF5342B569D4F80,
+ 0x3FF5AB07DD485400,
+ 0x3FF6247EB03A5580,
+ 0x3FF6A09E667F3BC0,
+ 0x3FF71F75E8EC5F40,
+ 0x3FF7A11473EB0180,
+ 0x3FF82589994CCE00,
+ 0x3FF8ACE5422AA0C0,
+ 0x3FF93737B0CDC5C0,
+ 0x3FF9C49182A3F080,
+ 0x3FFA5503B23E2540,
+ 0x3FFAE89F995AD380,
+ 0x3FFB7F76F2FB5E40,
+ 0x3FFC199BDD855280,
+ 0x3FFCB720DCEF9040,
+ 0x3FFD5818DCFBA480,
+ 0x3FFDFC97337B9B40,
+ 0x3FFEA4AFA2A490C0,
+ 0x3FFF50765B6E4540,
+};
+
+static npy_uint64 EXP_Table_tail[32] = { /* trailing part of 2^(j/32), j = 0..31, as raw double bit patterns; used together with EXP_Table_top[j] */
+ 0x0000000000000000,
+ 0x3D0A1D73E2A475B4,
+ 0x3CEEC5317256E308,
+ 0x3CF0A4EBBF1AED93,
+ 0x3D0D6E6FBE462876,
+ 0x3D053C02DC0144C8,
+ 0x3D0C3360FD6D8E0B,
+ 0x3D009612E8AFAD12,
+ 0x3CF52DE8D5A46306,
+ 0x3CE54E28AA05E8A9,
+ 0x3D011ADA0911F09F,
+ 0x3D068189B7A04EF8,
+ 0x3D038EA1CBD7F621,
+ 0x3CBDF0A83C49D86A,
+ 0x3D04AC64980A8C8F,
+ 0x3CD2C7C3E81BF4B7,
+ 0x3CE921165F626CDD,
+ 0x3D09EE91B8797785,
+ 0x3CDB5F54408FDB37,
+ 0x3CF28ACF88AFAB35,
+ 0x3CFB5BA7C55A192D,
+ 0x3D027A280E1F92A0,
+ 0x3CF01C7C46B071F3,
+ 0x3CFC8B424491CAF8,
+ 0x3D06AF439A68BB99,
+ 0x3CDBAA9EC206AD4F,
+ 0x3CFC2220CB12A092,
+ 0x3D048A81E5E8F4A5,
+ 0x3CDC976816BAD9B8,
+ 0x3CFEB968CAC39ED3,
+ 0x3CF9858F73A18F5E,
+ 0x3C99D3E12DD8A18B,
+};
+#endif
+
+/*
+ * Constants used in vector implementation of float32 exp(x)
+ */
+#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
+#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
+#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
+#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
+#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
+#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
+#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
+#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
+#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
+#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
+#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
+
+/*
+ * Constants used in vector implementation of float32 log(x)
+ */
+#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
+#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
+#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
+#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
+#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
+#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
+#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
+#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
+#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
+#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
+#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
+
+#endif
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 4265476b5..106c7e7c9 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -18,6 +18,7 @@
#include "lowlevel_strided_loops.h"
#include "numpy/npy_common.h"
#include "numpy/npy_math.h"
+#include "npy_simd_data.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#if !defined(_MSC_VER) || _MSC_VER >= 1600
@@ -387,6 +388,25 @@ run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp c
/**end repeat**/
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+AVX512F_exp_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride); /* forward declaration: (output, input, element count, input step); the definition treats the last argument as a step in bytes. NOTE(review): unlike the definition, this declaration is not behind the clang-version check */
+#endif
+static NPY_INLINE int
+run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{ /*
+ * Dispatch helper for the float64 exp loop.
+ * Returns 1 when the AVX512F kernel processed the whole loop and 0 when
+ * the caller must run its scalar fallback (AVX512F support not compiled
+ * in, an excluded clang version, or the blockability check failed).
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), 64)) { /* macro defined elsewhere; presumably verifies the output can be written in 64-byte vectors - confirm */
+ AVX512F_exp_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); /* args[1] is passed as output, args[0] as input */
+ return 1;
+ }
+ else
+ return 0;
+#endif
+#endif
+ return 0; /* reached only when the guarded code above is compiled out */
+}
/**begin repeat
* Float types
@@ -1695,6 +1715,22 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
return _mm512_scalef_ps(poly, quadrant);
}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_permute_x4var_pd(__m512d t0,
+ __m512d t1,
+ __m512d t2,
+ __m512d t3,
+ __m512i index)
+{ /*
+ * Per-lane lookup into a 32-entry table of doubles held in four
+ * registers: t0 = entries 0-7, t1 = 8-15, t2 = 16-23, t3 = 24-31.
+ * The visible caller passes indices already masked to 0..31.
+ * Each two-source permute resolves the index within one 16-entry half;
+ * the blend then picks the upper half for lanes whose index exceeds 15.
+ */
+
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(index, _mm512_set1_epi64(15),
+ _MM_CMPINT_GT); /* set for lanes that need entries 16-31 */
+ __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); /* candidate from entries 0-15 */
+ __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); /* candidate from entries 16-31 */
+ return _mm512_mask_blend_pd(lut_mask, res1, res2); /* res2 where lut_mask is set, res1 elsewhere */
+}
+
/**begin repeat
* #vsub = ps, pd#
* #type= npy_float, npy_double#
@@ -2654,6 +2690,157 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
#endif
/**end repeat**/
+/*
+ * Vectorized implementation of float64 exp(x) using AVX512F.
+ * Reference: Tang, P.T.P., "Table-driven implementation of the
+ * exponential function in IEEE floating-point
+ * arithmetic," ACM Transactions on Mathematical
+ * Software, vol. 15, pp. 144-157, 1989.
+ *
+ * op receives the results contiguously; ip is read with a step of
+ * `steps` bytes; array_size is the number of elements.
+ *
+ * 1) if x > mTH_max or x is +INF; return INF. The overflow floating-point
+ *    status is raised only for finite x > mTH_max, not for an INF input.
+ * 2) if x < mTH_min; return 0.0 (underflow)
+ * 3) if abs(x) < mTH_nearzero; return 1.0 (the lane is zeroed first, so
+ *    the stored value is exactly 1.0)
+ * 4) if x is NaN; return NaN
+ * 5) Range reduction:
+ *    x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64]
+ * 6) exp(r) - 1 is approximated by a polynomial function p(r)
+ *    exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r));
+ *
+ * NOTE(review): the guard below tests only
+ * HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, while the forward
+ * declaration and dispatcher earlier in this file also require
+ * NPY_HAVE_SSE2_INTRINSICS - confirm the difference is intentional.
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
+AVX512F_exp_DOUBLE(npy_double * op,
+ npy_double * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = array_size;
+ const npy_intp stride = steps / (npy_intp)sizeof(npy_double); /* input step converted from bytes to elements */
+ const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); /* doubles per 64-byte vector */
+ npy_int32 indexarr[8]; /* per-lane element offsets used by the strided gather */
+ for (npy_int32 ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride; /* NOTE(review): stored as 32-bit; assumes the caller limits the stride - confirm */
+ }
+
+ __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32);
+ __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC);
+ __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1);
+ __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2);
+ __m512i mMod = _mm512_set1_epi64(0x1f); /* keeps the low 5 bits: table index j in 0..31 */
+ __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1);
+ __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2);
+ __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3);
+ __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4);
+ __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5);
+ __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54); /* below this magnitude the result is 1.0 */
+ __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9); /* about 709.78: larger inputs give INF */
+ __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9); /* about -745.13: smaller inputs give 0 */
+ __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY);
+ __m512d zeros_d = _mm512_set1_pd(0.0f);
+ __m512d ones_d = _mm512_set1_pd(1.0f);
+ __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+ __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0])); /* the 32-entry tables are held in four registers of 8 doubles each */
+ __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1]));
+ __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2]));
+ __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3]));
+ __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0]));
+ __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
+ __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
+ __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));
+
+ __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); /* helper defined elsewhere; presumably yields an empty mask for 0 elements - confirm */
+ __mmask8 load_mask = avx512_get_full_load_mask_pd();
+ __mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask;
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) { /* final partial vector */
+ load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ }
+
+ __m512d x;
+ if (1 == stride) { /* contiguous input */
+ x = avx512_masked_load_pd(load_mask, ip);
+ }
+ else { /* strided input: gather through the precomputed offsets */
+ x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+ }
+
+ nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); /* x != x holds only for NaN lanes */
+ x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask); /* helper defined elsewhere; presumably replaces the masked lanes with 0 - confirm */
+ xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
+ xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
+ inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
+ __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
+ _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF)); /* clear the sign bit */
+ nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
+ mTH_nearzero, _CMP_LT_OQ);
+ nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask); /* NaN lanes were zeroed above and compare as near-zero; drop them */
+ overflow_mask = _mm512_kor(overflow_mask,
+ _mm512_kxor(xmax_mask, inf_mask)); /* accumulate lanes above mTH_max that are not exactly +INF */
+ x = avx512_set_masked_lanes_pd(x, zeros_d,
+ _mm512_kor(_mm512_kor(nan_mask, xmin_mask),
+ _mm512_kor(xmax_mask, nearzero_mask))); /* zero every special-case lane; its result is overwritten below */
+
+ /* z = x * 32/ln2 */
+ __m512d z = _mm512_mul_pd(x, InvLn2N);
+
+ /* round to nearest */
+ __m512d kd = _mm512_add_pd(z, mShift);
+ __m512i ki = _mm512_castpd_si512(kd); /* bit pattern of z + magic; its low bits carry the rounded integer */
+ kd = _mm512_sub_pd(kd, mShift);
+
+ /* r = (x + kd*mNegL1) + kd*mNegL2 */
+ __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x);
+ __m512d r2 = _mm512_mul_pd(kd, mNegL2);
+ __m512d r = _mm512_add_pd(r1,r2);
+
+ /* Polynomial approximation for exp(r) - 1 */
+ __m512d q = _mm512_fmadd_pd(mA5, r, mA4);
+ q = _mm512_fmadd_pd(q, r, mA3);
+ q = _mm512_fmadd_pd(q, r, mA2);
+ q = _mm512_fmadd_pd(q, r, mA1);
+ q = _mm512_mul_pd(q, r);
+ __m512d p = _mm512_fmadd_pd(r, q, r2);; /* NOTE(review): the second ';' is a stray empty statement */
+ p = _mm512_add_pd(r1, p);
+
+ /* Get 2^(j/32) from lookup table */
+ __m512i j = _mm512_and_epi64(ki, mMod);
+ __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1,
+ mTable_top_2, mTable_top_3, j);
+ __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
+ mTable_tail_2, mTable_tail_3, j);
+
+ /*
+ * s = top + tail;
+ * exp(x) = 2^m * (top + (tail + s * p));
+ */
+ __m512d s = _mm512_add_pd(top, tail);
+ __m512d res = _mm512_fmadd_pd(s, p, tail);
+ res = _mm512_add_pd(res, top);
+ res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32))); /* multiply by 2^floor(kd/32), i.e. 2^m */
+
+ /* return special cases */
+ res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
+ nearzero_mask); /* x is 0 in these lanes, so this stores 1.0 */
+ res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
+ nan_mask);
+ res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
+ res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);
+
+ _mm512_mask_storeu_pd(op, load_mask, res); /* write only the lanes selected by load_mask */
+
+ ip += num_lanes * stride;
+ op += num_lanes; /* output is advanced contiguously */
+ num_remaining_elements -= num_lanes;
+ }
+ if (overflow_mask) { /* raise the status once, after the loop */
+ npy_set_floatstatus_overflow();
+ }
+}
+#endif
+#endif
+
/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
diff --git a/numpy/core/tests/data/umath-validation-set-exp b/numpy/core/tests/data/umath-validation-set-exp
index 1b2cc9ce4..7c5ef3b33 100644
--- a/numpy/core/tests/data/umath-validation-set-exp
+++ b/numpy/core/tests/data/umath-validation-set-exp
@@ -133,3 +133,280 @@ np.float32,0xc29b43d5,0x077ffffc,3
np.float32,0xc1e61ff7,0x2ab504f5,3
np.float32,0xc2867878,0x0effff15,3
np.float32,0xc2a2324a,0x04fffff4,3
+#float64
+## near zero ##
+np.float64,0x8000000000000000,0x3ff0000000000000,1
+np.float64,0x8010000000000000,0x3ff0000000000000,1
+np.float64,0x8000000000000001,0x3ff0000000000000,1
+np.float64,0x8360000000000000,0x3ff0000000000000,1
+np.float64,0x9a70000000000000,0x3ff0000000000000,1
+np.float64,0xb9b0000000000000,0x3ff0000000000000,1
+np.float64,0xb810000000000000,0x3ff0000000000000,1
+np.float64,0xbc30000000000000,0x3ff0000000000000,1
+np.float64,0xb6a0000000000000,0x3ff0000000000000,1
+np.float64,0x0000000000000000,0x3ff0000000000000,1
+np.float64,0x0010000000000000,0x3ff0000000000000,1
+np.float64,0x0000000000000001,0x3ff0000000000000,1
+np.float64,0x0360000000000000,0x3ff0000000000000,1
+np.float64,0x1a70000000000000,0x3ff0000000000000,1
+np.float64,0x3c30000000000000,0x3ff0000000000000,1
+np.float64,0x36a0000000000000,0x3ff0000000000000,1
+np.float64,0x39b0000000000000,0x3ff0000000000000,1
+np.float64,0x3810000000000000,0x3ff0000000000000,1
+## underflow ##
+np.float64,0xc0c6276800000000,0x0000000000000000,1
+np.float64,0xc0c62d918ce2421d,0x0000000000000000,1
+np.float64,0xc0c62d918ce2421e,0x0000000000000000,1
+np.float64,0xc0c62d91a0000000,0x0000000000000000,1
+np.float64,0xc0c62d9180000000,0x0000000000000000,1
+np.float64,0xc0c62dea45ee3e06,0x0000000000000000,1
+np.float64,0xc0c62dea45ee3e07,0x0000000000000000,1
+np.float64,0xc0c62dea40000000,0x0000000000000000,1
+np.float64,0xc0c62dea60000000,0x0000000000000000,1
+np.float64,0xc0875f1120000000,0x0000000000000000,1
+np.float64,0xc0875f113c30b1c8,0x0000000000000000,1
+np.float64,0xc0875f1140000000,0x0000000000000000,1
+np.float64,0xc093480000000000,0x0000000000000000,1
+np.float64,0xffefffffffffffff,0x0000000000000000,1
+np.float64,0xc7efffffe0000000,0x0000000000000000,1
+## overflow ##
+np.float64,0x40862e52fefa39ef,0x7ff0000000000000,1
+np.float64,0x40872e42fefa39ef,0x7ff0000000000000,1
+## +/- INF, +/- NAN ##
+np.float64,0x7ff0000000000000,0x7ff0000000000000,1
+np.float64,0xfff0000000000000,0x0000000000000000,1
+np.float64,0x7ff8000000000000,0x7ff8000000000000,1
+np.float64,0xfff8000000000000,0xfff8000000000000,1
+## output denormal ##
+np.float64,0xc087438520000000,0x0000000000000001,1
+np.float64,0xc08743853f2f4461,0x0000000000000001,1
+np.float64,0xc08743853f2f4460,0x0000000000000001,1
+np.float64,0xc087438540000000,0x0000000000000001,1
+## between -745.13321910 and 709.78271289 ##
+np.float64,0xbff760cd14774bd9,0x3fcdb14ced00ceb6,1
+np.float64,0xbff760cd20000000,0x3fcdb14cd7993879,1
+np.float64,0xbff760cd00000000,0x3fcdb14d12fbd264,1
+np.float64,0xc07f1cf360000000,0x130c1b369af14fda,1
+np.float64,0xbeb0000000000000,0x3feffffe00001000,1
+np.float64,0xbd70000000000000,0x3fefffffffffe000,1
+np.float64,0xc084fd46e5c84952,0x0360000000000139,1
+np.float64,0xc084fd46e5c84953,0x035ffffffffffe71,1
+np.float64,0xc084fd46e0000000,0x0360000b9096d32c,1
+np.float64,0xc084fd4700000000,0x035fff9721d12104,1
+np.float64,0xc086232bc0000000,0x0010003af5e64635,1
+np.float64,0xc086232bdd7abcd2,0x001000000000007c,1
+np.float64,0xc086232bdd7abcd3,0x000ffffffffffe7c,1
+np.float64,0xc086232be0000000,0x000ffffaf57a6fc9,1
+np.float64,0xc086233920000000,0x000fe590e3b45eb0,1
+np.float64,0xc086233938000000,0x000fe56133493c57,1
+np.float64,0xc086233940000000,0x000fe5514deffbbc,1
+np.float64,0xc086234c98000000,0x000fbf1024c32ccb,1
+np.float64,0xc086234ca0000000,0x000fbf0065bae78d,1
+np.float64,0xc086234c80000000,0x000fbf3f623a7724,1
+np.float64,0xc086234ec0000000,0x000fbad237c846f9,1
+np.float64,0xc086234ec8000000,0x000fbac27cfdec97,1
+np.float64,0xc086234ee0000000,0x000fba934cfd3dc2,1
+np.float64,0xc086234ef0000000,0x000fba73d7f618d9,1
+np.float64,0xc086234f00000000,0x000fba54632dddc0,1
+np.float64,0xc0862356e0000000,0x000faae0945b761a,1
+np.float64,0xc0862356f0000000,0x000faac13eb9a310,1
+np.float64,0xc086235700000000,0x000faaa1e9567b0a,1
+np.float64,0xc086236020000000,0x000f98cd75c11ed7,1
+np.float64,0xc086236ca0000000,0x000f8081b4d93f89,1
+np.float64,0xc086236cb0000000,0x000f8062b3f4d6c5,1
+np.float64,0xc086236cc0000000,0x000f8043b34e6f8c,1
+np.float64,0xc086238d98000000,0x000f41220d9b0d2c,1
+np.float64,0xc086238da0000000,0x000f4112cc80a01f,1
+np.float64,0xc086238d80000000,0x000f414fd145db5b,1
+np.float64,0xc08624fd00000000,0x000cbfce8ea1e6c4,1
+np.float64,0xc086256080000000,0x000c250747fcd46e,1
+np.float64,0xc08626c480000000,0x000a34f4bd975193,1
+np.float64,0xbf50000000000000,0x3feff800ffeaac00,1
+np.float64,0xbe10000000000000,0x3fefffffff800000,1
+np.float64,0xbcd0000000000000,0x3feffffffffffff8,1
+np.float64,0xc055d589e0000000,0x38100004bf94f63e,1
+np.float64,0xc055d58a00000000,0x380ffff97f292ce8,1
+np.float64,0xbfd962d900000000,0x3fe585a4b00110e1,1
+np.float64,0x3ff4bed280000000,0x400d411e7a58a303,1
+np.float64,0x3fff0b3620000000,0x401bd7737ffffcf3,1
+np.float64,0x3ff0000000000000,0x4005bf0a8b145769,1
+np.float64,0x3eb0000000000000,0x3ff0000100000800,1
+np.float64,0x3d70000000000000,0x3ff0000000001000,1
+np.float64,0x40862e42e0000000,0x7fefff841808287f,1
+np.float64,0x40862e42fefa39ef,0x7fefffffffffff2a,1
+np.float64,0x40862e0000000000,0x7feef85a11e73f2d,1
+np.float64,0x4000000000000000,0x401d8e64b8d4ddae,1
+np.float64,0x4009242920000000,0x40372a52c383a488,1
+np.float64,0x4049000000000000,0x44719103e4080b45,1
+np.float64,0x4008000000000000,0x403415e5bf6fb106,1
+np.float64,0x3f50000000000000,0x3ff00400800aab55,1
+np.float64,0x3e10000000000000,0x3ff0000000400000,1
+np.float64,0x3cd0000000000000,0x3ff0000000000004,1
+np.float64,0x40562e40a0000000,0x47effed088821c3f,1
+np.float64,0x40562e42e0000000,0x47effff082e6c7ff,1
+np.float64,0x40562e4300000000,0x47f00000417184b8,1
+np.float64,0x3fe8000000000000,0x4000ef9db467dcf8,1
+np.float64,0x402b12e8d4f33589,0x412718f68c71a6fe,1
+np.float64,0x402b12e8d4f3358a,0x412718f68c71a70a,1
+np.float64,0x402b12e8c0000000,0x412718f59a7f472e,1
+np.float64,0x402b12e8e0000000,0x412718f70c0eac62,1
+##use 1st entry
+np.float64,0x40631659AE147CB4,0x4db3a95025a4890f,1
+np.float64,0xC061B87D2E85A4E2,0x332640c8e2de2c51,1
+np.float64,0x405A4A50BE243AF4,0x496a45e4b7f0339a,1
+np.float64,0xC0839898B98EC5C6,0x0764027828830df4,1
+#use 2nd entry
+np.float64,0xC072428C44B6537C,0x2596ade838b96f3e,1
+np.float64,0xC053057C5E1AE9BF,0x3912c8fad18fdadf,1
+np.float64,0x407E89C78328BAA3,0x6bfe35d5b9a1a194,1
+np.float64,0x4083501B6DD87112,0x77a855503a38924e,1
+#use 3rd entry
+np.float64,0x40832C6195F24540,0x7741e73c80e5eb2f,1
+np.float64,0xC083D4CD557C2EC9,0x06b61727c2d2508e,1
+np.float64,0x400C48F5F67C99BD,0x404128820f02b92e,1
+np.float64,0x4056E36D9B2DF26A,0x4830f52ff34a8242,1
+#use 4th entry
+np.float64,0x4080FF700D8CBD06,0x70fa70df9bc30f20,1
+np.float64,0x406C276D39E53328,0x543eb8e20a8f4741,1
+np.float64,0xC070D6159BBD8716,0x27a4a0548c904a75,1
+np.float64,0xC052EBCF8ED61F83,0x391c0e92368d15e4,1
+#use 5th entry
+np.float64,0xC061F892A8AC5FBE,0x32f807a89efd3869,1
+np.float64,0x4021D885D2DBA085,0x40bd4dc86d3e3270,1
+np.float64,0x40767AEEEE7D4FCF,0x605e22851ee2afb7,1
+np.float64,0xC0757C5D75D08C80,0x20f0751599b992a2,1
+#use 6th entry
+np.float64,0x405ACF7A284C4CE3,0x499a4e0b7a27027c,1
+np.float64,0xC085A6C9E80D7AF5,0x0175914009d62ec2,1
+np.float64,0xC07E4C02F86F1DAE,0x1439269b29a9231e,1
+np.float64,0x4080D80F9691CC87,0x7088a6cdafb041de,1
+#use 7th entry
+np.float64,0x407FDFD84FBA0AC1,0x6deb1ae6f9bc4767,1
+np.float64,0x40630C06A1A2213D,0x4dac7a9d51a838b7,1
+np.float64,0x40685FDB30BB8B4F,0x5183f5cc2cac9e79,1
+np.float64,0x408045A2208F77F4,0x6ee299e08e2aa2f0,1
+#use 8th entry
+np.float64,0xC08104E391F5078B,0x0ed397b7cbfbd230,1
+np.float64,0xC031501CAEFAE395,0x3e6040fd1ea35085,1
+np.float64,0xC079229124F6247C,0x1babf4f923306b1e,1
+np.float64,0x407FB65F44600435,0x6db03beaf2512b8a,1
+#use 9th entry
+np.float64,0xC07EDEE8E8E8A5AC,0x136536cec9cbef48,1
+np.float64,0x4072BB4086099A14,0x5af4d3c3008b56cc,1
+np.float64,0x4050442A2EC42CB4,0x45cd393bd8fad357,1
+np.float64,0xC06AC28FB3D419B4,0x2ca1b9d3437df85f,1
+#use 10th entry
+np.float64,0x40567FC6F0A68076,0x480c977fd5f3122e,1
+np.float64,0x40620A2F7EDA59BB,0x4cf278e96f4ce4d7,1
+np.float64,0xC085044707CD557C,0x034aad6c968a045a,1
+np.float64,0xC07374EA5AC516AA,0x23dd6afdc03e83d5,1
+#use 11th entry
+np.float64,0x4073CC95332619C1,0x5c804b1498bbaa54,1
+np.float64,0xC0799FEBBE257F31,0x1af6a954c43b87d2,1
+np.float64,0x408159F19EA424F6,0x7200858efcbfc84d,1
+np.float64,0x404A81F6F24C0792,0x44b664a07ce5bbfa,1
+#use 12th entry
+np.float64,0x40295FF1EFB9A741,0x4113c0e74c52d7b0,1
+np.float64,0x4073975F4CC411DA,0x5c32be40b4fec2c1,1
+np.float64,0x406E9DE52E82A77E,0x56049c9a3f1ae089,1
+np.float64,0x40748C2F52560ED9,0x5d93bc14fd4cd23b,1
+#use 13th entry
+np.float64,0x4062A553CDC4D04C,0x4d6266bfde301318,1
+np.float64,0xC079EC1D63598AB7,0x1a88cb184dab224c,1
+np.float64,0xC0725C1CB3167427,0x25725b46f8a081f6,1
+np.float64,0x407888771D9B45F9,0x6353b1ec6bd7ce80,1
+#use 14th entry
+np.float64,0xC082CBA03AA89807,0x09b383723831ce56,1
+np.float64,0xC083A8961BB67DD7,0x0735b118d5275552,1
+np.float64,0xC076BC6ECA12E7E3,0x1f2222679eaef615,1
+np.float64,0xC072752503AA1A5B,0x254eb832242c77e1,1
+#use 15th entry
+np.float64,0xC058800792125DEC,0x371882372a0b48d4,1
+np.float64,0x4082909FD863E81C,0x7580d5f386920142,1
+np.float64,0xC071616F8FB534F9,0x26dbe20ef64a412b,1
+np.float64,0x406D1AB571CAA747,0x54ee0d55cb38ac20,1
+#use 16th entry
+np.float64,0x406956428B7DAD09,0x52358682c271237f,1
+np.float64,0xC07EFC2D9D17B621,0x133b3e77c27a4d45,1
+np.float64,0xC08469BAC5BA3CCA,0x050863e5f42cc52f,1
+np.float64,0x407189D9626386A5,0x593cb1c0b3b5c1d3,1
+#use 17th entry
+np.float64,0x4077E652E3DEB8C6,0x6269a10dcbd3c752,1
+np.float64,0x407674C97DB06878,0x605485dcc2426ec2,1
+np.float64,0xC07CE9969CF4268D,0x16386cf8996669f2,1
+np.float64,0x40780EE32D5847C4,0x62a436bd1abe108d,1
+#use 18th entry
+np.float64,0x4076C3AA5E1E8DA1,0x60c62f56a5e72e24,1
+np.float64,0xC0730AFC7239B9BE,0x24758ead095cec1e,1
+np.float64,0xC085CC2B9C420DDB,0x0109cdaa2e5694c1,1
+np.float64,0x406D0765CB6D7AA4,0x54e06f8dd91bd945,1
+#use 19th entry
+np.float64,0xC082D011F3B495E7,0x09a6647661d279c2,1
+np.float64,0xC072826AF8F6AFBC,0x253acd3cd224507e,1
+np.float64,0x404EB9C4810CEA09,0x457933dbf07e8133,1
+np.float64,0x408284FBC97C58CE,0x755f6eb234aa4b98,1
+#use 20th entry
+np.float64,0x40856008CF6EDC63,0x7d9c0b3c03f4f73c,1
+np.float64,0xC077CB2E9F013B17,0x1d9b3d3a166a55db,1
+np.float64,0xC0479CA3C20AD057,0x3bad40e081555b99,1
+np.float64,0x40844CD31107332A,0x7a821d70aea478e2,1
+#use 21st entry
+np.float64,0xC07C8FCC0BFCC844,0x16ba1cc8c539d19b,1
+np.float64,0xC085C4E9A3ABA488,0x011ff675ba1a2217,1
+np.float64,0x4074D538B32966E5,0x5dfd9d78043c6ad9,1
+np.float64,0xC0630CA16902AD46,0x3231a446074cede6,1
+#use 22nd entry
+np.float64,0xC06C826733D7D0B7,0x2b5f1078314d41e1,1
+np.float64,0xC0520DF55B2B907F,0x396c13a6ce8e833e,1
+np.float64,0xC080712072B0F437,0x107eae02d11d98ea,1
+np.float64,0x40528A6150E19EFB,0x469fdabda02228c5,1
+#use 23rd entry
+np.float64,0xC07B1D74B6586451,0x18d1253883ae3b48,1
+np.float64,0x4045AFD7867DAEC0,0x43d7d634fc4c5d98,1
+np.float64,0xC07A08B91F9ED3E2,0x1a60973e6397fc37,1
+np.float64,0x407B3ECF0AE21C8C,0x673e03e9d98d7235,1
+#use 24th entry
+np.float64,0xC078AEB6F30CEABF,0x1c530b93ab54a1b3,1
+np.float64,0x4084495006A41672,0x7a775b6dc7e63064,1
+np.float64,0x40830B1C0EBF95DD,0x76e1e6eed77cfb89,1
+np.float64,0x407D93E8F33D8470,0x6a9adbc9e1e4f1e5,1
+#use 25th entry
+np.float64,0x4066B11A09EFD9E8,0x504dd528065c28a7,1
+np.float64,0x408545823723AEEB,0x7d504a9b1844f594,1
+np.float64,0xC068C711F2CA3362,0x2e104f3496ea118e,1
+np.float64,0x407F317FCC3CA873,0x6cf0732c9948ebf4,1
+#use 26th entry
+np.float64,0x407AFB3EBA2ED50F,0x66dc28a129c868d5,1
+np.float64,0xC075377037708ADE,0x21531a329f3d793e,1
+np.float64,0xC07C30066A1F3246,0x174448baa16ded2b,1
+np.float64,0xC06689A75DE2ABD3,0x2fad70662fae230b,1
+#use 27th entry
+np.float64,0x4081514E9FCCF1E0,0x71e673b9efd15f44,1
+np.float64,0xC0762C710AF68460,0x1ff1ed7d8947fe43,1
+np.float64,0xC0468102FF70D9C4,0x3be0c3a8ff3419a3,1
+np.float64,0xC07EA4CEEF02A83E,0x13b908f085102c61,1
+#use 28th entry
+np.float64,0xC06290B04AE823C4,0x328a83da3c2e3351,1
+np.float64,0xC0770EB1D1C395FB,0x1eab281c1f1db5fe,1
+np.float64,0xC06F5D4D838A5BAE,0x29500ea32fb474ea,1
+np.float64,0x40723B3133B54C5D,0x5a3c82c7c3a2b848,1
+#use 29th entry
+np.float64,0x4085E6454CE3B4AA,0x7f20319b9638d06a,1
+np.float64,0x408389F2A0585D4B,0x7850667c58aab3d0,1
+np.float64,0xC0382798F9C8AE69,0x3dc1c79fe8739d6d,1
+np.float64,0xC08299D827608418,0x0a4335f76cdbaeb5,1
+#use 30th entry
+np.float64,0xC06F3DED43301BF1,0x2965670ae46750a8,1
+np.float64,0xC070CAF6BDD577D9,0x27b4aa4ffdd29981,1
+np.float64,0x4078529AD4B2D9F2,0x6305c12755d5e0a6,1
+np.float64,0xC055B14E75A31B96,0x381c2eda6d111e5d,1
+#use 31st entry
+np.float64,0x407B13EE414FA931,0x6700772c7544564d,1
+np.float64,0x407EAFDE9DE3EC54,0x6c346a0e49724a3c,1
+np.float64,0xC08362F398B9530D,0x07ffeddbadf980cb,1
+np.float64,0x407E865CDD9EEB86,0x6bf866cac5e0d126,1
+#use 32nd entry
+np.float64,0x407FB62DBC794C86,0x6db009f708ac62cb,1
+np.float64,0xC063D0BAA68CDDDE,0x31a3b2a51ce50430,1
+np.float64,0xC05E7706A2231394,0x34f24bead6fab5c9,1
+np.float64,0x4083E3A06FDE444E,0x79527b7a386d1937,1
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 233a0b1d6..60c9fe437 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -640,6 +640,16 @@ class TestExp:
yf = np.array(y, dtype=dt)*log2_
assert_almost_equal(np.exp(yf), xf)
+ def test_exp_strides(self):  # float64 exp of a strided view must match the strided view of exp of the whole array
+ np.random.seed(42)  # fixed seed so the random inputs are reproducible
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])  # slice steps, including reversed (negative) ones
+ sizes = np.arange(2,100)  # array lengths 2..99
+ for ii in sizes:
+ x_f64 = np.float64(np.random.uniform(low=0.01, high=709.1,size=ii))  # presumably capped at 709.1 to stay under the float64 exp overflow point (~709.78)
+ y_true = np.exp(x_f64)  # reference computed on the unsliced array
+ for jj in strides:
+ assert_array_almost_equal_nulp(np.exp(x_f64[::jj]), y_true[::jj], nulp=2)  # allow at most 2 ULP difference
+
class TestSpecialFloats:
def test_exp_values(self):
x = [np.nan, np.nan, np.inf, 0.]
@@ -652,6 +662,8 @@ class TestSpecialFloats:
with np.errstate(over='raise'):
assert_raises(FloatingPointError, np.exp, np.float32(100.))
assert_raises(FloatingPointError, np.exp, np.float32(1E19))
+ assert_raises(FloatingPointError, np.exp, np.float64(800.))
+ assert_raises(FloatingPointError, np.exp, np.float64(1E19))
def test_log_values(self):
with np.errstate(all='ignore'):
diff --git a/numpy/core/tests/test_umath_accuracy.py b/numpy/core/tests/test_umath_accuracy.py
index fd7214396..e3c2eb025 100644
--- a/numpy/core/tests/test_umath_accuracy.py
+++ b/numpy/core/tests/test_umath_accuracy.py
@@ -3,7 +3,7 @@ import platform
from os import path
import sys
import pytest
-from ctypes import c_float, c_int, cast, pointer, POINTER
+from ctypes import c_longlong, c_double, c_float, c_int, cast, pointer, POINTER
from numpy.testing import assert_array_max_ulp
from numpy.core._multiarray_umath import __cpu_features__
@@ -16,10 +16,15 @@ platform_skip = pytest.mark.skipif(not runtest,
# convert string to hex function taken from:
# https://stackoverflow.com/questions/1592158/convert-hex-to-float #
-def convert(s):
+def convert(s, datatype="np.float32"):  # reinterpret a hex bit-pattern string as a float; any datatype other than "np.float64" takes the 32-bit path
i = int(s, 16) # convert from hex to a Python int
- cp = pointer(c_int(i)) # make this into a c integer
- fp = cast(cp, POINTER(c_float)) # cast the int pointer to a float pointer
+ if (datatype == "np.float64"):
+ cp = pointer(c_longlong(i)) # make this into a c long long integer (assumed 64-bit); ctypes integer types do no overflow checking, so patterns with the top bit set are accepted
+ fp = cast(cp, POINTER(c_double)) # cast the long long pointer to a double pointer
+ else:
+ cp = pointer(c_int(i)) # make this into a c integer (assumed 32-bit)
+ fp = cast(cp, POINTER(c_float)) # cast the int pointer to a float pointer
+
return fp.contents.value # dereference the pointer, get the float
str_to_float = np.vectorize(convert)
@@ -45,8 +50,8 @@ class TestAccuracy:
npfunc = getattr(np, filename.split('-')[3])
for datatype in np.unique(data['type']):
data_subset = data[data['type'] == datatype]
- inval = np.array(str_to_float(data_subset['input'].astype(str)), dtype=eval(datatype))
- outval = np.array(str_to_float(data_subset['output'].astype(str)), dtype=eval(datatype))
+ inval = np.array(str_to_float(data_subset['input'].astype(str), data_subset['type'].astype(str)), dtype=eval(datatype))
+ outval = np.array(str_to_float(data_subset['output'].astype(str), data_subset['type'].astype(str)), dtype=eval(datatype))
perm = np.random.permutation(len(inval))
inval = inval[perm]
outval = outval[perm]