author     Sayed Adel <seiko@imavr.com>                        2021-04-18 00:30:48 +0100
committer  DWesl <22566757+DWesl@users.noreply.github.com>     2021-07-20 12:18:12 -0400
commit     62293d45fb31f81bc7df8038a6d1c982ac05b010 (patch)
tree       9446f64deda76b72fe60d8f52ec795a7fc298a4d /numpy
parent     3cec3ae80e626952a6b61a72516713217d99fefe (diff)
download   numpy-62293d45fb31f81bc7df8038a6d1c982ac05b010.tar.gz
SIMD: Force inlining all functions that accept AVX registers
To avoid spilling vector registers onto the stack, which can reduce performance, and to work around a GCC bug on WIN64.
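For context, a force-inline macro in the spirit of NPY_FINLINE is typically defined along the following lines. This is only a minimal sketch of the usual per-compiler spelling; the actual definition lives in NumPy's common headers and may differ in detail.

/* Sketch of a force-inline macro (illustrative, not the verbatim NumPy definition). */
#if defined(_MSC_VER)
    #define NPY_FINLINE static __forceinline
#elif defined(__GNUC__)
    #define NPY_FINLINE static inline __attribute__((always_inline))
#else
    #define NPY_FINLINE static
#endif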
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops_arithm_fp.dispatch.c.src      |  18
-rw-r--r--   numpy/core/src/umath/loops_exponent_log.dispatch.c.src   |  70
-rw-r--r--   numpy/core/src/umath/simd.inc.src                        | 106
3 files changed, 97 insertions, 97 deletions
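The change itself is mechanical across all three files. As a hedged illustration (the helper name below is hypothetical, not one of the patched functions), each AVX helper goes from a plain static inline definition to the forced-inline form so its vector arguments stay in registers:

/* Before: the compiler may decline to inline, so the __m512 argument
   can be spilled to the stack at the call site. */
static NPY_INLINE __m512
avx512_square_ps_example(__m512 x)
{
    return _mm512_mul_ps(x, x);
}

/* After: inlining is forced, keeping the vector value in a register. */
NPY_FINLINE __m512
avx512_square_ps_example(__m512 x)
{
    return _mm512_mul_ps(x, x);
}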
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index d8c8fdc9e..51b167844 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -565,36 +565,36 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
#endif
#ifdef AVX512F_NOMSVC
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
@@ -613,18 +613,18 @@ avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem
* #INF = NPY_INFINITYF, NPY_INFINITY#
* #NAN = NPY_NANF, NPY_NAN#
*/
-static @vtype@
+NPY_FINLINE @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static @vtype@
+NPY_FINLINE @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
// x1 = r1, i1
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 9970ad2ea..b17643d23 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -45,19 +45,19 @@
#ifdef SIMD_AVX2_FMA3
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_get_full_load_mask_pd(void)
{
return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,7 +66,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -74,7 +74,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_si256((__m256i*) addr);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_masked_gather_ps(__m256 src,
npy_float* addr,
__m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_masked_gather_pd(__m256d src,
npy_double* addr,
__m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
return _mm256_maskload_pd(addr, mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
return _mm256_blendv_pd(x, val, mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_invert_mask_ps(__m256 ymask)
{
return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_invert_mask_pd(__m256i ymask)
{
return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_mantissa(__m256 x)
{
/*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
_mm256_castps_si256(x), mantissa_bits), exp_126_bits));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_scalef_ps(__m256 poly, __m256 quadrant)
{
/*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
#ifdef SIMD_AVX512F
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_gather_ps(__m512 src,
npy_float* addr,
__m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_gather_pd(__m512d src,
npy_double* addr,
__m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
return _mm512_mask_blend_pd(mask, x, val);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
return _mm512_mask_mov_ps(x, ymask, y);
}
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_get_mantissa(__m512 x)
{
return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
return _mm512_scalef_ps(poly, quadrant);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_permute_x4var_pd(__m512d t0,
__m512d t1,
__m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
return _mm512_mask_blend_pd(lut_mask, res1, res2);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
__m512d t4, __m512d t5, __m512d t6, __m512d t7,
__m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
{
@vtype@ reduced_x = @fmadd@(y, c1, x);
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b535599c6..654ab81cc 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -399,7 +399,7 @@ run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp con
* # VOP = min, max#
*/
-static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
+NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
npy_float r;
__m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
@@ -409,7 +409,7 @@ static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
return r;
}
-static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
+NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
npy_double r;
__m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
@@ -440,7 +440,7 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
* the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
* calling convention leading to C2719 on 32 bit, see #4795
*/
-static NPY_INLINE void
+NPY_FINLINE void
sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
npy_bool * op)
{
@@ -557,7 +557,7 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
*/
/* sets invalid fpu flag on QNaN for consistency with packed compare */
-static NPY_INLINE int
+NPY_FINLINE int
sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
{
@vtype@ one = @vpre@_set1_@vsuf@(1);
@@ -733,19 +733,19 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_full_load_mask_pd(void)
{
return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -754,7 +754,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -762,7 +762,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_si256((__m256i*) addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_gather_ps(__m256 src,
npy_float* addr,
__m256i vindex,
@@ -771,7 +771,7 @@ fma_masked_gather_ps(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_gather_pd(__m256d src,
npy_double* addr,
__m128i vindex,
@@ -780,43 +780,43 @@ fma_masked_gather_pd(__m256d src,
return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
return _mm256_maskload_pd(addr, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
return _mm256_blendv_pd(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_invert_mask_ps(__m256 ymask)
{
return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_invert_mask_pd(__m256i ymask)
{
return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
@@ -826,37 +826,37 @@ fma_invert_mask_pd(__m256i ymask)
* #vsub = ps, pd#
* #vtype = __m256, __m256d#
*/
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_abs_@vsub@(@vtype@ x)
{
return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_reciprocal_@vsub@(@vtype@ x)
{
return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_rint_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_floor_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_ceil_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_trunc_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
@@ -865,31 +865,31 @@ fma_trunc_@vsub@(@vtype@ x)
#endif
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_gather_ps(__m512 src,
npy_float* addr,
__m512i vindex,
@@ -898,7 +898,7 @@ avx512_masked_gather_ps(__m512 src,
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_gather_pd(__m512d src,
npy_double* addr,
__m256i vindex,
@@ -907,43 +907,43 @@ avx512_masked_gather_pd(__m512d src,
return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
return _mm512_mask_blend_pd(mask, x, val);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
return _mm512_mask_mov_ps(x, ymask, y);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
return _mm512_knot(ymask);
@@ -963,56 +963,56 @@ avx512_invert_mask_pd(__mmask8 ymask)
* #INF = NPY_INFINITYF, NPY_INFINITY#
* #NAN = NPY_NANF, NPY_NAN#
*/
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_abs_@vsub@(@vtype@ x)
{
return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
_mm512_set1_@epi_vsub@ (@and_const@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_reciprocal_@vsub@(@vtype@ x)
{
return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_rint_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x08);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_floor_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x09);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_ceil_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x0A);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_trunc_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x0B);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cabsolute_@vsub@(const @vtype@ x1,
const @vtype@ x2,
const __m512i re_indices,
@@ -1057,7 +1057,7 @@ avx512_cabsolute_@vsub@(const @vtype@ x1,
return _mm512_mul_@vsub@(hypot, larger);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_conjugate_@vsub@(const @vtype@ x)
{
/*
@@ -1070,7 +1070,7 @@ avx512_conjugate_@vsub@(const @vtype@ x)
return _mm512_castsi512_@vsub@(res);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
// x1 = r1, i1
@@ -1083,7 +1083,7 @@ avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_csquare_@vsub@(@vtype@ x)
{
return avx512_cmul_@vsub@(x, x);
@@ -1106,25 +1106,25 @@ avx512_csquare_@vsub@(@vtype@ x)
#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_sqrt_ps(@vtype@ x)
{
return _mm@vsize@_sqrt_ps(x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_sqrt_pd(@vtype@d x)
{
return _mm@vsize@_sqrt_pd(x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_square_ps(@vtype@ x)
{
return _mm@vsize@_mul_ps(x,x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_square_pd(@vtype@d x)
{
return _mm@vsize@_mul_pd(x,x);
@@ -1615,7 +1615,7 @@ AVX512F_absolute_@TYPE@(@type@ * op,
* you never know
*/
#if !@and@
-static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
{
const @vtype@ zero = @vpre@_setzero_@vsuf@();
const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);