author     Sayed Adel <seiko@imavr.com>                        2021-04-18 00:30:48 +0100
committer  DWesl <22566757+DWesl@users.noreply.github.com>     2021-07-20 12:18:12 -0400
commit     62293d45fb31f81bc7df8038a6d1c982ac05b010 (patch)
tree       9446f64deda76b72fe60d8f52ec795a7fc298a4d /numpy
parent     3cec3ae80e626952a6b61a72516713217d99fefe (diff)
download   numpy-62293d45fb31f81bc7df8038a6d1c982ac05b010.tar.gz
SIMD: Force inlining all functions that accept AVX registers
To avoid spilling vector registers onto the stack, which can reduce performance, and to work around a GCC bug on WIN64.
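For context, a force-inline macro in the spirit of NPY_FINLINE is typically defined along the following lines. This is only a minimal sketch of the usual per-compiler spelling; the actual definition lives in NumPy's common headers and may differ in detail.

/* Sketch of a force-inline macro (illustrative, not the verbatim NumPy definition). */
#if defined(_MSC_VER)
    #define NPY_FINLINE static __forceinline
#elif defined(__GNUC__)
    #define NPY_FINLINE static inline __attribute__((always_inline))
#else
    #define NPY_FINLINE static
#endif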
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops_arithm_fp.dispatch.c.src      |  18
-rw-r--r--   numpy/core/src/umath/loops_exponent_log.dispatch.c.src   |  70
-rw-r--r--   numpy/core/src/umath/simd.inc.src                        | 106
3 files changed, 97 insertions, 97 deletions
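The change itself is mechanical across all three files. As a hedged illustration (the helper name below is hypothetical, not one of the patched functions), each AVX helper goes from a plain static inline definition to the forced-inline form so its vector arguments stay in registers:

/* Before: the compiler may decline to inline, so the __m512 argument
   can be spilled to the stack at the call site. */
static NPY_INLINE __m512
avx512_square_ps_example(__m512 x)
{
    return _mm512_mul_ps(x, x);
}

/* After: inlining is forced, keeping the vector value in a register. */
NPY_FINLINE __m512
avx512_square_ps_example(__m512 x)
{
    return _mm512_mul_ps(x, x);
}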
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index d8c8fdc9e..51b167844 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -565,36 +565,36 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
#endif
#ifdef AVX512F_NOMSVC
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
@@ -613,18 +613,18 @@ avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem
* #INF = NPY_INFINITYF, NPY_INFINITY#
* #NAN = NPY_NANF, NPY_NAN#
*/
-static @vtype@
+NPY_FINLINE @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static @vtype@
+NPY_FINLINE @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
// x1 = r1, i1
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 9970ad2ea..b17643d23 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -45,19 +45,19 @@
#ifdef SIMD_AVX2_FMA3
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_get_full_load_mask_pd(void)
{
return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,7 +66,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -74,7 +74,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_si256((__m256i*) addr);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_masked_gather_ps(__m256 src,
npy_float* addr,
__m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_masked_gather_pd(__m256d src,
npy_double* addr,
__m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
return _mm256_maskload_pd(addr, mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
return _mm256_blendv_pd(x, val, mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_invert_mask_ps(__m256 ymask)
{
return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
fma_invert_mask_pd(__m256i ymask)
{
return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_get_mantissa(__m256 x)
{
/*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
_mm256_castps_si256(x), mantissa_bits), exp_126_bits));
}
-static NPY_INLINE __m256
+NPY_FINLINE __m256
fma_scalef_ps(__m256 poly, __m256 quadrant)
{
/*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
#ifdef SIMD_AVX512F
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_gather_ps(__m512 src,
npy_float* addr,
__m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_gather_pd(__m512d src,
npy_double* addr,
__m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
return _mm512_mask_blend_pd(mask, x, val);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
return _mm512_mask_mov_ps(x, ymask, y);
}
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_get_mantissa(__m512 x)
{
return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
}
-static NPY_INLINE __m512
+NPY_FINLINE __m512
avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
return _mm512_scalef_ps(poly, quadrant);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_permute_x4var_pd(__m512d t0,
__m512d t1,
__m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
return _mm512_mask_blend_pd(lut_mask, res1, res2);
}
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
__m512d t4, __m512d t5, __m512d t6, __m512d t7,
__m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
{
@vtype@ reduced_x = @fmadd@(y, c1, x);
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b535599c6..654ab81cc 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -399,7 +399,7 @@ run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp con
* # VOP = min, max#
*/
-static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
+NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
npy_float r;
__m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
@@ -409,7 +409,7 @@ static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
return r;
}
-static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
+NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
npy_double r;
__m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
@@ -440,7 +440,7 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
* the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
* calling convention leading to C2719 on 32 bit, see #4795
*/
-static NPY_INLINE void
+NPY_FINLINE void
sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
npy_bool * op)
{
@@ -557,7 +557,7 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
*/
/* sets invalid fpu flag on QNaN for consistency with packed compare */
-static NPY_INLINE int
+NPY_FINLINE int
sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
{
@vtype@ one = @vpre@_set1_@vsuf@(1);
@@ -733,19 +733,19 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_full_load_mask_pd(void)
{
return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -754,7 +754,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_ps(addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -762,7 +762,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
return _mm256_loadu_si256((__m256i*) addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_gather_ps(__m256 src,
npy_float* addr,
__m256i vindex,
@@ -771,7 +771,7 @@ fma_masked_gather_ps(__m256 src,
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_gather_pd(__m256d src,
npy_double* addr,
__m128i vindex,
@@ -780,43 +780,43 @@ fma_masked_gather_pd(__m256d src,
return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
return _mm256_maskload_pd(addr, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
return _mm256_blendv_pd(x, val, mask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_invert_mask_ps(__m256 ymask)
{
return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_invert_mask_pd(__m256i ymask)
{
return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
@@ -826,37 +826,37 @@ fma_invert_mask_pd(__m256i ymask)
* #vsub = ps, pd#
* #vtype = __m256, __m256d#
*/
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_abs_@vsub@(@vtype@ x)
{
return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_reciprocal_@vsub@(@vtype@ x)
{
return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_rint_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_floor_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_ceil_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_trunc_@vsub@(@vtype@ x)
{
return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
@@ -865,31 +865,31 @@ fma_trunc_@vsub@(@vtype@ x)
#endif
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_full_load_mask_pd(void)
{
return 0xFF;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
return (0x01 << num_elem) - 0x01;
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_gather_ps(__m512 src,
npy_float* addr,
__m512i vindex,
@@ -898,7 +898,7 @@ avx512_masked_gather_ps(__m512 src,
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_gather_pd(__m512d src,
npy_double* addr,
__m256i vindex,
@@ -907,43 +907,43 @@ avx512_masked_gather_pd(__m512d src,
return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
return _mm512_mask_blend_pd(mask, x, val);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
return _mm512_mask_mov_ps(x, ymask, y);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
return _mm512_knot(ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
return _mm512_knot(ymask);
@@ -963,56 +963,56 @@ avx512_invert_mask_pd(__mmask8 ymask)
* #INF = NPY_INFINITYF, NPY_INFINITY#
* #NAN = NPY_NANF, NPY_NAN#
*/
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_abs_@vsub@(@vtype@ x)
{
return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
_mm512_set1_@epi_vsub@ (@and_const@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_reciprocal_@vsub@(@vtype@ x)
{
return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_rint_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x08);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_floor_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x09);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_ceil_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x0A);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_trunc_@vsub@(@vtype@ x)
{
return _mm512_roundscale_@vsub@(x, 0x0B);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cabsolute_@vsub@(const @vtype@ x1,
const @vtype@ x2,
const __m512i re_indices,
@@ -1057,7 +1057,7 @@ avx512_cabsolute_@vsub@(const @vtype@ x1,
return _mm512_mul_@vsub@(hypot, larger);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_conjugate_@vsub@(const @vtype@ x)
{
/*
@@ -1070,7 +1070,7 @@ avx512_conjugate_@vsub@(const @vtype@ x)
return _mm512_castsi512_@vsub@(res);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
// x1 = r1, i1
@@ -1083,7 +1083,7 @@ avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_csquare_@vsub@(@vtype@ x)
{
return avx512_cmul_@vsub@(x, x);
@@ -1106,25 +1106,25 @@ avx512_csquare_@vsub@(@vtype@ x)
#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_sqrt_ps(@vtype@ x)
{
return _mm@vsize@_sqrt_ps(x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_sqrt_pd(@vtype@d x)
{
return _mm@vsize@_sqrt_pd(x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_square_ps(@vtype@ x)
{
return _mm@vsize@_mul_ps(x,x);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_square_pd(@vtype@d x)
{
return _mm@vsize@_mul_pd(x,x);
@@ -1615,7 +1615,7 @@ AVX512F_absolute_@TYPE@(@type@ * op,
* you never know
*/
#if !@and@
-static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
{
const @vtype@ zero = @vpre@_setzero_@vsuf@();
const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);