Diffstat (limited to 'numpy')
 -rw-r--r--  numpy/core/code_generators/generate_umath.py    8
 -rw-r--r--  numpy/core/include/numpy/npy_common.h            8
 -rw-r--r--  numpy/core/setup_common.py                       5
 -rw-r--r--  numpy/core/src/umath/cpuid.c                    22
 -rw-r--r--  numpy/core/src/umath/loops.c.src                 4
 -rw-r--r--  numpy/core/src/umath/loops.h.src                 2
 -rw-r--r--  numpy/core/src/umath/simd.inc.src               79
 7 files changed, 76 insertions(+), 52 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf31c0a88..ae871ea6f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -663,7 +663,7 @@ defdict = {
docstrings.get('numpy.core.umath.cos'),
None,
TD('e', f='cos', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='cos', astype={'e':'f'}),
TD(P, f='cos'),
),
@@ -672,7 +672,7 @@ defdict = {
docstrings.get('numpy.core.umath.sin'),
None,
TD('e', f='sin', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='sin', astype={'e':'f'}),
TD(P, f='sin'),
),
@@ -709,7 +709,7 @@ defdict = {
docstrings.get('numpy.core.umath.exp'),
None,
TD('e', f='exp', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='exp', astype={'e':'f'}),
TD(P, f='exp'),
),
@@ -732,7 +732,7 @@ defdict = {
docstrings.get('numpy.core.umath.log'),
None,
TD('e', f='log', astype={'e':'f'}),
- TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+ TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='log', astype={'e':'f'}),
TD(P, f='log'),
),
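[Note: the simd= entries above drive the umath code generator, which emits runtime-dispatch code that swaps in an ISA-specific inner loop when npy_cpu_supports() reports the feature. A rough sketch of the dispatch generated for the float32 cos loop after this change is shown below; the slot index, guard macro, and wrapper are assumptions, while the names follow the FLOAT_<func>_<isa> pattern from loops.h.src and npy_cpu_supports() from cpuid.c.]

    /* Hedged sketch of the generated dispatch (real output is produced by
     * generate_umath.py); slot index and guard macro are illustrative. */
    #ifdef HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
    if (npy_cpu_supports("fma")) {
        /* replace the baseline float32 loop with the FMA-specialized one */
        cos_functions[1] = FLOAT_cos_fma;
    }
    #endif
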
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 108c0a202..27b83f7b5 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -44,10 +44,14 @@
#else
#define NPY_GCC_TARGET_AVX
#endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define HAVE_ATTRIBUTE_TARGET_FMA
+#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
+#endif
+
#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define NPY_GCC_TARGET_AVX2
#endif
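[Note: NPY_GCC_TARGET_FMA is a per-function target attribute: it lets individual helpers in simd.inc.src be compiled with avx2+fma code generation while the rest of the translation unit stays at the baseline ISA, and HAVE_ATTRIBUTE_TARGET_FMA guards compilers that cannot honor the attribute with intrinsics. A minimal sketch of how such a helper is declared follows; fma_scale_add is a hypothetical name, the attribute pattern mirrors the real helpers in simd.inc.src further down.]

    #if defined HAVE_ATTRIBUTE_TARGET_FMA
    static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
    fma_scale_add(__m256 x, __m256 scale, __m256 bias)
    {
        /* target("avx2,fma") on this function lets the intrinsic expand
         * to a single fused multiply-add instruction */
        return _mm256_fmadd_ps(x, scale, bias);
    }
    #endif
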
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5..a3f7acd6d 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -178,9 +178,10 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
# gcc 4.8.4 support attributes but not with intrisics
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
'attribute_target_avx2_with_intrinsics',
- '__m256 temp = _mm256_set1_ps(1.0)',
+ '__m256 temp = _mm256_set1_ps(1.0); temp = \
+ _mm256_fmadd_ps(temp, temp, temp)',
'immintrin.h'),
('__attribute__((target("avx512f")))',
'attribute_target_avx512f_with_intrinsics',
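[Note: the entry above is expanded through the template in the comment and compiled at configure time; adding the _mm256_fmadd_ps call means HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS is only defined when the compiler can actually emit FMA from intrinsics, not merely accept the attribute. Roughly, the probe translation unit now looks like the sketch below; the exact wrapper emitted by the build is an assumption.]

    /* Approximate configure-time probe for the updated entry. */
    #include <immintrin.h>
    int __attribute__((target("avx2,fma")))
    attribute_target_avx2_with_intrinsics(void *unused)
    {
        __m256 temp = _mm256_set1_ps(1.0);
        temp = _mm256_fmadd_ps(temp, temp, temp);
        return 0;
    }
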
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 8673f1736..9d4ba13d8 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -48,6 +48,25 @@ int os_avx512_support(void)
#endif
}
+static NPY_INLINE
+int cpu_supports_fma(void)
+{
+ unsigned int feature = 0x01;
+ unsigned int a, b, c, d;
+#ifdef __x86_64__
+ __asm__ volatile (
+ "cpuid" "\n\t"
+ : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+ : "a" (feature));
+ /*
+ * FMA is the 12th bit of ECX
+ */
+ return (c >> 12) & 1;
+#else
+ return 0;
+#endif
+}
+
/*
* Primitive cpu feature detect function
* Currently only supports checking for avx on gcc compatible compilers.
@@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
return 0;
#endif
}
+ else if (strcmp(feature, "fma") == 0) {
+ return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
+ }
else if (strcmp(feature, "avx2") == 0) {
return __builtin_cpu_supports("avx2") && os_avx_support();
}
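[Note: cpu_supports_fma() issues CPUID with EAX=1 and tests bit 12 of ECX, and the "fma" branch of npy_cpu_supports() additionally requires AVX2 plus OS support for saving AVX state. For comparison only, the same runtime question can be asked with the GCC/clang builtins; this standalone sketch is illustrative and not part of the patch.]

    #include <stdio.h>

    int main(void)
    {
        /* __builtin_cpu_supports() performs equivalent CPUID queries */
        __builtin_cpu_init();
        printf("fma=%d avx2=%d\n",
               __builtin_cpu_supports("fma"),
               __builtin_cpu_supports("avx2"));
        return 0;
    }
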
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index f29b15477..80bd87875 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
/**end repeat**/
/**begin repeat
- * #isa = avx512f, avx2#
- * #ISA = AVX512F, AVX2#
+ * #isa = avx512f, fma#
+ * #ISA = AVX512F, FMA#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
*/
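[Note: the /**begin repeat**/ block is NumPy's template preprocessor; substituting fma for avx2 makes it instantiate FLOAT_<func>_fma loops, still guarded by HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS since FMA support is probed through the avx2,fma attribute. An approximate expansion for func=exp, with the scalar fallback paraphrased, is sketched below.]

    #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
    NPY_NO_EXPORT void
    FLOAT_exp_fma(char **args, npy_intp *dimensions, npy_intp *steps,
                  void *NPY_UNUSED(func))
    {
        /* try the FMA SIMD kernel; fall back to the scalar loop if the
         * stride/size checks in run_unary_fma_exp_FLOAT() decline */
        if (!run_unary_fma_exp_FLOAT(args, dimensions, steps)) {
            UNARY_LOOP {
                const npy_float in1 = *(npy_float *)ip1;
                *(npy_float *)op1 = npy_expf(in1);
            }
        }
    }
    #endif
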
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index accd0eab6..5070ab38b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -184,7 +184,7 @@ NPY_NO_EXPORT void
FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
/**begin repeat1
- * #isa = avx512f, avx2#
+ * #isa = avx512f, fma#
*/
NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 6da75d724..43a0ddaad 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
*/
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512f#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512f#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
* #REGISTER_SIZE = 32, 64#
*/
@@ -141,7 +142,7 @@ abs_ptrdiff(char *a, char *b)
* #func = exp, log#
*/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif
@@ -149,7 +150,7 @@ static NPY_INLINE void
static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
@@ -162,7 +163,7 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
/**end repeat1**/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
#endif
@@ -170,7 +171,7 @@ static NPY_INLINE void
static NPY_INLINE int
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
{
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
return 1;
@@ -1142,20 +1143,14 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_fmadd(__m256 a, __m256 b, __m256 c)
-{
- return _mm256_add_ps(_mm256_mul_ps(a, b), c);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_full_load_mask(void)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_full_load_mask(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
@@ -1163,8 +1158,8 @@ avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_gather(__m256 src,
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_gather(__m256 src,
npy_float* addr,
__m256i vindex,
__m256 mask)
@@ -1172,39 +1167,39 @@ avx2_masked_gather(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_load(__m256 mask, npy_float* addr)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_load(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_blend(__m256 x, __m256 y, __m256 ymask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
return _mm256_cvtepi32_ps(
_mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_should_negate(__m256i k, __m256i andop, __m256i cmp)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
{
- return avx2_should_calculate_sine(k, andop, cmp);
+ return fma_should_calculate_sine(k, andop, cmp);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_exponent(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_exponent(__m256 x)
{
/*
* Special handling of denormals:
@@ -1230,8 +1225,8 @@ avx2_get_exponent(__m256 x)
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_mantissa(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_mantissa(__m256 x)
{
/*
* Special handling of denormals:
@@ -1369,17 +1364,18 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
#endif
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #or = or_ps, kor#
* #vsub = , _mask#
* #mask = __m256, __mmask16#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
**/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
/*
* Vectorized Cody-Waite range reduction technique
@@ -1446,8 +1442,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
/**end repeat**/
/**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #BYTES = 32, 64#
@@ -1456,11 +1452,12 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* #or_masks =_mm256_or_ps, _mm512_kor#
* #and_masks =_mm256_and_ps, _mm512_kand#
* #xor_masks =_mm256_xor_ps, _mm512_kxor#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
* #mask_to_int = _mm256_movemask_ps, #
* #full_mask= 0xFF, 0xFFFF#
* #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
* #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
*/
@@ -1485,7 +1482,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* performance.
*/
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_sincos_FLOAT(npy_float * op,
npy_float * ip,
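
[Note: the net effect in simd.inc.src is that the avx2_* helper family is renamed to fma_* and the avx2_fmadd() emulation (multiply then add, two roundings) is dropped in favor of the true fused intrinsic via the #fmadd# substitution. The difference, shown outside the patch in a minimal illustrative form:]

    #include <immintrin.h>

    __attribute__((target("avx2")))
    static __m256 mul_then_add(__m256 a, __m256 b, __m256 c)
    {
        /* what the removed avx2_fmadd() helper did: two ops, two roundings */
        return _mm256_add_ps(_mm256_mul_ps(a, b), c);
    }

    __attribute__((target("avx2,fma")))
    static __m256 fused_madd(__m256 a, __m256 b, __m256 c)
    {
        /* what _mm256_fmadd_ps gives: one vfmadd, a*b+c rounded once */
        return _mm256_fmadd_ps(a, b, c);
    }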