-rw-r--r--  numpy/core/code_generators/generate_umath.py  |   8
-rw-r--r--  numpy/core/include/numpy/npy_common.h         |   8
-rw-r--r--  numpy/core/include/numpy/npy_math.h           |  26
-rw-r--r--  numpy/core/setup_common.py                    |   5
-rw-r--r--  numpy/core/src/umath/cpuid.c                  |  22
-rw-r--r--  numpy/core/src/umath/loops.c.src              |  33
-rw-r--r--  numpy/core/src/umath/loops.h.src              |   4
-rw-r--r--  numpy/core/src/umath/simd.inc.src             | 315
-rw-r--r--  numpy/core/tests/test_umath.py                |  41
9 files changed, 410 insertions(+), 52 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1747272..ae871ea6f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,8 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.cos'),
None,
+ TD('e', f='cos', astype={'e':'f'}),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='cos', astype={'e':'f'}),
TD(P, f='cos'),
),
@@ -669,6 +671,8 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sin'),
None,
+ TD('e', f='sin', astype={'e':'f'}),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='sin', astype={'e':'f'}),
TD(P, f='sin'),
),
@@ -705,7 +709,7 @@ defdict = {
docstrings.get('numpy.core.umath.exp'),
None,
TD('e', f='exp', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='exp', astype={'e':'f'}),
TD(P, f='exp'),
),
@@ -728,7 +732,7 @@ defdict = {
docstrings.get('numpy.core.umath.log'),
None,
TD('e', f='log', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='log', astype={'e':'f'}),
TD(P, f='log'),
),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 108c0a202..27b83f7b5 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -44,10 +44,14 @@
#else
#define NPY_GCC_TARGET_AVX
#endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define HAVE_ATTRIBUTE_TARGET_FMA
+#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
+#endif
+
#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define NPY_GCC_TARGET_AVX2
#endif
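
The new NPY_GCC_TARGET_FMA macro leans on GCC's per-function target attribute: one translation unit can carry AVX2/FMA code paths while the rest of the file is compiled for the baseline ISA. A minimal standalone sketch of the mechanism (hypothetical example, not part of this patch):

    #include <immintrin.h>

    /* Only this function is compiled with AVX2+FMA enabled; the rest of
     * the file can still target baseline SSE2. */
    __attribute__((target("avx2,fma")))
    static void fused_madd8(float *out, const float *a, const float *b,
                            const float *c)
    {
        __m256 va = _mm256_loadu_ps(a);
        __m256 vb = _mm256_loadu_ps(b);
        __m256 vc = _mm256_loadu_ps(c);
        /* a*b + c with a single rounding step, via the FMA unit */
        _mm256_storeu_ps(out, _mm256_fmadd_ps(va, vb, vc));
    }

The caller must still verify at runtime that the CPU supports these instructions before taking such a path; that is what the cpuid.c change below adds.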
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2..126b861bf 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -144,7 +144,22 @@ NPY_INLINE static float __npy_nzerof(void)
#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
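
These are C99 hexadecimal float literals: 0x1.45f306p-1f denotes (1 + 0x45f306/2^24) * 2^-1 ≈ 0.636620, i.e. 2/PI. A quick standalone sanity check (illustrative only) that the three Cody-Waite pieces together represent -PI/2:

    #include <stdio.h>

    int main(void)
    {
        /* c1 and c2 are exactly representable in single precision;
         * c3 carries the residual bits of -pi/2 */
        double c1 = -0x1.921fb0p+00;
        double c2 = -0x1.5110b4p-22;
        double c3 = -0x1.846988p-48;
        printf("%.17g\n", c1 + c2 + c3);  /* approx -1.5707963267948966 */
        return 0;
    }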
/*
* Integer functions.
*/
@@ -163,6 +178,15 @@ NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
/*
+ * The AVX trig functions share a common implementation for both sin & cos.
+ * This enum is used to distinguish between the two.
+ */
+typedef enum {
+ npy_compute_sin,
+ npy_compute_cos
+} NPY_TRIG_OP;
+
+/*
* C99 double math funcs
*/
NPY_INPLACE double npy_sin(double x);
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5..a3f7acd6d 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -178,9 +178,10 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
# gcc 4.8.4 supports attributes but not with intrinsics
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
'attribute_target_avx2_with_intrinsics',
- '__m256 temp = _mm256_set1_ps(1.0)',
+ '__m256 temp = _mm256_set1_ps(1.0); temp = \
+ _mm256_fmadd_ps(temp, temp, temp)',
'immintrin.h'),
('__attribute__((target("avx512f")))',
'attribute_target_avx512f_with_intrinsics',
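
Expanded, the modified first entry asks the compiler to build a probe along these lines (a sketch of what the generated check looks like; if it compiles and links, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS is defined):

    #include<immintrin.h>
    int
    __attribute__((target("avx2,fma")))
    attribute_target_avx2_with_intrinsics(void *arg)
    {
        /* The body now exercises an FMA intrinsic, so compilers that
         * accept the attribute but cannot emit the instruction
         * (e.g. gcc 4.8.4) are correctly rejected. */
        __m256 temp = _mm256_set1_ps(1.0);
        temp = _mm256_fmadd_ps(temp, temp, temp);
        return 0;
    }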
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 8673f1736..72c6493e8 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -48,6 +48,25 @@ int os_avx512_support(void)
#endif
}
+static NPY_INLINE
+int cpu_supports_fma(void)
+{
+#ifdef __x86_64__
+ unsigned int feature = 0x01;
+ unsigned int a, b, c, d;
+ __asm__ volatile (
+ "cpuid" "\n\t"
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+ : "a" (feature));
+ /*
+ * FMA support is reported in bit 12 of ECX (CPUID leaf 0x01)
+ */
+ return (c >> 12) & 1;
+#else
+ return 0;
+#endif
+}
+
/*
* Primitive cpu feature detect function
* Currently only supports checking for avx on gcc compatible compilers.
@@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
return 0;
#endif
}
+ else if (strcmp(feature, "fma") == 0) {
+ return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
+ }
else if (strcmp(feature, "avx2") == 0) {
return __builtin_cpu_supports("avx2") && os_avx_support();
}
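
For reference, an equivalent feature probe written with GCC's <cpuid.h> helper instead of raw inline assembly (hypothetical alternative, not what the patch uses):

    #include <cpuid.h>  /* GCC/clang helper; defines __get_cpuid and bit_FMA */

    static int cpu_supports_fma_alt(void)
    {
        unsigned int a, b, c, d;
        if (!__get_cpuid(0x01, &a, &b, &c, &d)) {
            return 0;
        }
        return (c & bit_FMA) != 0;  /* bit_FMA == (1 << 12), ECX bit 12 */
    }

Note that npy_cpu_supports("fma") additionally requires avx2 and OS support for saving AVX state, since the FMA loops use 256-bit registers.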
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1a4885133..2028a5712 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1594,8 +1594,8 @@ NPY_NO_EXPORT void
/**end repeat**/
/**begin repeat
- * #func = exp, log#
- * #scalarf = npy_expf, npy_logf#
+ * #func = sin, cos, exp, log#
+ * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
*/
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
/**end repeat**/
/**begin repeat
- * #isa = avx512f, avx2#
- * #ISA = AVX512F, AVX2#
+ * #isa = avx512f, fma#
+ * #ISA = AVX512F, FMA#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
*/
@@ -1642,6 +1642,31 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
}
/**end repeat1**/
+
+/**begin repeat1
+ * #func = cos, sin#
+ * #enum = npy_compute_cos, npy_compute_sin#
+ * #scalarf = npy_cosf, npy_sinf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
+ UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+ @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
+#else
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = @scalarf@(in1);
+#endif
+ }
+ }
+}
+
+/**end repeat1**/
+
+
/**end repeat**/
/**begin repeat
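
After template substitution (func=cos, isa=fma), the loop added in the hunk above reads roughly as follows, assuming the FMA build checks (@CHK@ and NPY_HAVE_SSE2_INTRINSICS) are defined; a sketch for orientation, not literal generator output:

    NPY_NO_EXPORT NPY_GCC_OPT_3 void
    FLOAT_cos_fma(char **args, npy_intp *dimensions, npy_intp *steps,
                  void *NPY_UNUSED(data))
    {
        /* try the blocked vector path first; it returns 0 for layouts
         * it cannot handle */
        if (!run_unary_fma_sincos_FLOAT(args, dimensions, steps,
                                        npy_compute_cos)) {
            UNARY_LOOP {
                /* fall back to the vector kernel one element at a time */
                FMA_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1,
                                 steps[0], npy_compute_cos);
            }
        }
    }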
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 7f05a693a..5070ab38b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,13 +178,13 @@ NPY_NO_EXPORT void
/**end repeat**/
/**begin repeat
- * #func = exp, log#
+ * #func = sin, cos, exp, log#
*/
NPY_NO_EXPORT void
FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
/**begin repeat1
- * #isa = avx512f, avx2#
+ * #isa = avx512f, fma#
*/
NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9816a1da4..7aec1ff49 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
*/
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512f#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512f#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
* #REGISTER_SIZE = 32, 64#
*/
@@ -141,7 +142,7 @@ abs_ptrdiff(char *a, char *b)
* #func = exp, log#
*/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif
@@ -149,7 +150,7 @@ static NPY_INLINE void
static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
@@ -162,6 +163,25 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
/**end repeat1**/
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, NPY_TRIG_OP my_trig_op)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+ @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
/**end repeat**/
@@ -1123,20 +1143,14 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_fmadd(__m256 a, __m256 b, __m256 c)
-{
- return _mm256_add_ps(_mm256_mul_ps(a, b), c);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_full_load_mask(void)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_full_load_mask(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
@@ -1144,8 +1158,8 @@ avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_gather(__m256 src,
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_gather(__m256 src,
npy_float* addr,
__m256i vindex,
__m256 mask)
@@ -1153,26 +1167,39 @@ avx2_masked_gather(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_load(__m256 mask, npy_float* addr)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_load(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_blend(__m256 x, __m256 y, __m256 ymask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_exponent(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+{
+ return _mm256_cvtepi32_ps(
+ _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
+{
+ return fma_should_calculate_sine(k, andop, cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_exponent(__m256 x)
{
/*
* Special handling of denormals:
@@ -1198,8 +1225,8 @@ avx2_get_exponent(__m256 x)
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_mantissa(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_mantissa(__m256 x)
{
/*
* Special handling of denormals:
@@ -1225,7 +1252,7 @@ avx2_get_mantissa(__m256 x)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_scalef_ps(__m256 poly, __m256 quadrant)
+fma_scalef_ps(__m256 poly, __m256 quadrant)
{
/*
* Handle denormals (which occur when quadrant <= -125):
@@ -1305,6 +1332,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
return _mm512_mask_mov_ps(x, ymask, y);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
+{
+ return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
+{
+ return avx512_should_calculate_sine(k, andop, cmp);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_get_exponent(__m512 x)
{
@@ -1325,17 +1364,28 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
#endif
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #or = or_ps, kor#
* #vsub = , _mask#
* #mask = __m256, __mmask16#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
**/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
+
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x* - y*c2
+ * 3) x* = x* - y*c3
+ * c1 and c2 are exactly representable floats; c3 = C - c1 - c2 simulates higher precision
+ */
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
{
@@ -1344,12 +1394,56 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
reduced_x = @fmadd@(y, c3, reduced_x);
return reduced_x;
}
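
A scalar rendering of the same three-step reduction, using the constants added to npy_math.h (illustrative sketch; the vector code performs exactly these fused multiply-adds lane-wise):

    #include <math.h>

    /* y = rintf(x * NPY_TWO_O_PIf) selects the quadrant; the constants
     * are negated pieces of pi/2, so each fmaf subtracts a piece of
     * y*pi/2 from the running remainder */
    static float codywaite_reduce(float x, float y)
    {
        float r = fmaf(y, NPY_CODY_WAITE_PI_O_2_HIGHf, x);
        r = fmaf(y, NPY_CODY_WAITE_PI_O_2_MEDf, r);
        r = fmaf(y, NPY_CODY_WAITE_PI_O_2_LOWf, r);
        return r;  /* reduced to roughly [-pi/4, pi/4] */
    }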
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
+{
+ @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
+ x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
+ @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
+ x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
+ return _mm@vsize@_@or@(m1,m2);
+}
+
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
+ @vtype@ invf2, @vtype@ invf0)
+{
+ @vtype@ cos = @fmadd@(invf8, x2, invf6);
+ cos = @fmadd@(cos, x2, invf4);
+ cos = @fmadd@(cos, x2, invf2);
+ cos = @fmadd@(cos, x2, invf0);
+ return cos;
+}
+
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
+ @vtype@ invf5, @vtype@ invf3,
+ @vtype@ zero)
+{
+ @vtype@ sin = @fmadd@(invf9, x2, invf7);
+ sin = @fmadd@(sin, x2, invf5);
+ sin = @fmadd@(sin, x2, invf3);
+ sin = @fmadd@(sin, x2, zero);
+ sin = @fmadd@(sin, x, x);
+ return sin;
+}
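+
Scalar equivalents of the two polynomial kernels, in the same Horner form (sketch only; the coefficients are the near-reciprocal-factorial constants from npy_math.h):

    #include <math.h>

    /* cos(x) ~ 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8!, x in [-pi/4, pi/4] */
    static float cosine_poly(float x2)
    {
        float c = fmaf(NPY_COEFF_INVF8_COSINEf, x2, NPY_COEFF_INVF6_COSINEf);
        c = fmaf(c, x2, NPY_COEFF_INVF4_COSINEf);
        c = fmaf(c, x2, NPY_COEFF_INVF2_COSINEf);
        c = fmaf(c, x2, NPY_COEFF_INVF0_COSINEf);
        return c;
    }

    /* sin(x) ~ x - x^3/3! + x^5/5! - x^7/7! + x^9/9!, x in [-pi/4, pi/4] */
    static float sine_poly(float x, float x2)
    {
        float s = fmaf(NPY_COEFF_INVF9_SINEf, x2, NPY_COEFF_INVF7_SINEf);
        s = fmaf(s, x2, NPY_COEFF_INVF5_SINEf);
        s = fmaf(s, x2, NPY_COEFF_INVF3_SINEf);
        s = fmaf(s, x2, 0.0f);
        s = fmaf(s, x, x);
        return s;
    }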
#endif
/**end repeat**/
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #BYTES = 32, 64#
@@ -1358,13 +1452,165 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* #or_masks =_mm256_or_ps, _mm512_kor#
* #and_masks =_mm256_and_ps, _mm512_kand#
* #xor_masks =_mm256_xor_ps, _mm512_kxor#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
* #mask_to_int = _mm256_movemask_ps, #
* #full_mask= 0xFF, 0xFFFF#
* #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
* #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in ZMM/YMM registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly,
+ * leading to catastrophic cancellation. We compute sine/cosine for these by
+ * calling glibc in a scalar fashion.
+ * (5) The vectorized implementation has a max ULP of 1.49 and runs at least
+ * 5-7x faster than the scalar loops when the magnitude of all elements in
+ * the array is < 71476.0625f (117435.992f for sine). The worst case is when
+ * all elements are large, which costs about a 1-2% reduction in
+ * performance.
*/
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_sincos_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps,
+ NPY_TRIG_OP my_trig_op)
+{
+ const npy_intp stride = steps/sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_float large_number = 71476.0625f;
+ if (my_trig_op == npy_compute_sin) {
+ large_number = 117435.992f;
+ }
+
+ /* Load up frequently used constants */
+ @vtype@i zeros = _mm@vsize@_set1_epi32(0);
+ @vtype@i ones = _mm@vsize@_set1_epi32(1);
+ @vtype@i twos = _mm@vsize@_set1_epi32(2);
+ @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
+ @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
+ @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
+ @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
+ @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
+ @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
+ @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
+ @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
+ @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
+ @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
+ @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
+ @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
+ @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
+ @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+ @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
+ @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
+ @vtype@i iquadrant;
+ @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
+ @mask@ load_mask = @isa@_get_full_load_mask();
+ npy_intp num_remaining_elements = array_size;
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+ while (num_remaining_elements > 0) {
+
+ if (num_remaining_elements < num_lanes) {
+ load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+ num_lanes);
+ }
+
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+ }
+
+ /*
+ * For elements outside of this range, Cody-Waite's range reduction
+ * becomes inaccurate and we will call glibc to compute cosine for
+ * these numbers
+ */
+
+ glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+ glibc_mask = @and_masks@(load_mask, glibc_mask);
+ nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
+ x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+ npy_int iglibc_mask = @mask_to_int@(glibc_mask);
+
+ if (iglibc_mask != @full_mask@) {
+ quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
+
+ /* round to nearest */
+ quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+ quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+ /* Cody-Waite's range reduction algorithm */
+ reduced_x = @isa@_range_reduction(x, quadrant,
+ codyw_c1, codyw_c2, codyw_c3);
+ reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
+
+ /* compute cosine and sine */
+ cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
+ cos_invf2, cos_invf0);
+ sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
+ sin_invf5, sin_invf3, zero_f);
+
+ iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
+ if (my_trig_op == npy_compute_cos) {
+ iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+ }
+
+ /* blend sin and cos based on the quadrant */
+ sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
+ cos = @isa@_blend(cos, sin, sine_mask);
+
+ /* multiply by -1 for appropriate elements */
+ negate_mask = @isa@_should_negate(iquadrant, twos, twos);
+ cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+ cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+
+ @masked_store@(op, @cvtps_epi32@(load_mask), cos);
+ }
+
+ /* process elements using glibc for large elements */
+ if (my_trig_op == npy_compute_cos) {
+ for (int ii = 0; iglibc_mask != 0; ii++) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_cosf(ip[ii*stride]);
+ }
+ iglibc_mask = iglibc_mask >> 1;
+ }
+ }
+ else {
+ for (int ii = 0; iglibc_mask != 0; ii++) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_sinf(ip[ii*stride]);
+ }
+ iglibc_mask = iglibc_mask >> 1;
+ }
+ }
+ ip += num_lanes*stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
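
The quadrant bookkeeping above is easier to follow in scalar form. A hypothetical sketch of the same mapping (names invented; sinf/cosf stand in for the polynomial kernels):

    #include <math.h>

    /* For x = x* + k*pi/2, k mod 4 selects sin/cos of x* and the sign.
     * cos(x) reuses the sine table by bumping k, since
     * cos(x) = sin(x + pi/2); this is the ones-increment above. */
    static float sincos_blend(float reduced, int k, NPY_TRIG_OP op)
    {
        if (op == npy_compute_cos) {
            k += 1;
        }
        float s = sinf(reduced);
        float c = cosf(reduced);
        float result = ((k & 1) == 0) ? s : c;  /* sine_mask */
        if ((k & 2) == 2) {                     /* negate_mask */
            result = -result;
        }
        return result;
    }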
/*
* Vectorized implementation of exp using AVX2 and AVX512:
@@ -1381,7 +1627,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* same x = 0xc2781e37)
*/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_exp_FLOAT(npy_float * op,
npy_float * ip,
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 91d403d98..ef48fed05 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -678,20 +678,49 @@ class TestSpecialFloats(object):
assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
assert_raises(FloatingPointError, np.log, np.float32(-1.0))
-class TestExpLogFloat32(object):
+ def test_sincos_values(self):
+ with np.errstate(all='ignore'):
+ x = [np.nan, np.nan, np.nan, np.nan]
+ y = [np.nan, -np.nan, np.inf, -np.inf]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.sin(yf), xf)
+ assert_equal(np.cos(yf), xf)
+
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.sin, np.float32(-np.inf))
+ assert_raises(FloatingPointError, np.sin, np.float32(np.inf))
+ assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
+ assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+
+
+class TestSIMDFloat32(object):
def test_exp_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
x_f64 = np.float64(x_f32)
- assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+ assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=3)
def test_log_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
x_f64 = np.float64(x_f32)
- assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+ assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=4)
+
+ def test_sincos_float32(self):
+ np.random.seed(42)
+ N = 1000000
+ M = np.int(N/20)
+ index = np.random.randint(low=0, high=N, size=M)
+ x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=N))
+ # test coverage for elements > 117435.992f for which glibc is used
+ x_f32[index] = np.float32(10E+10*np.random.rand(M))
+ x_f64 = np.float64(x_f32)
+ assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+ assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
- def test_strided_exp_log_float32(self):
+ def test_strided_float32(self):
np.random.seed(42)
strides = np.random.randint(low=-100, high=100, size=100)
sizes = np.random.randint(low=1, high=2000, size=100)
@@ -699,9 +728,13 @@ class TestExpLogFloat32(object):
x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
exp_true = np.exp(x_f32)
log_true = np.log(x_f32)
+ sin_true = np.sin(x_f32)
+ cos_true = np.cos(x_f32)
for jj in strides:
assert_array_almost_equal_nulp(np.exp(x_f32[::jj]), exp_true[::jj], nulp=2)
assert_array_almost_equal_nulp(np.log(x_f32[::jj]), log_true[::jj], nulp=2)
+ assert_array_almost_equal_nulp(np.sin(x_f32[::jj]), sin_true[::jj], nulp=2)
+ assert_array_almost_equal_nulp(np.cos(x_f32[::jj]), cos_true[::jj], nulp=2)
class TestLogAddExp(_FilterInvalids):
def test_logaddexp_values(self):