Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops_arithm_fp.dispatch.c.src      |  18
-rw-r--r--   numpy/core/src/umath/loops_exponent_log.dispatch.c.src   |  70
-rw-r--r--   numpy/core/src/umath/simd.inc.src                        | 106
3 files changed, 97 insertions, 97 deletions
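The change below is a mechanical cleanup: every SIMD helper previously declared static NPY_INLINE (or plain static) is redeclared with the NPY_FINLINE macro, which asks the compiler to always inline these small load/blend/mask wrappers. As a rough sketch only, not NumPy's exact definition (which lives in its common headers), a force-inline macro of this kind typically expands along these lines:

    /* Hypothetical sketch of a force-inline macro in the spirit of NPY_FINLINE;
     * NumPy's real definition may differ in detail. */
    #if defined(_MSC_VER)
        #define NPY_FINLINE static __forceinline
    #elif defined(__GNUC__)
        #define NPY_FINLINE static inline __attribute__((always_inline))
    #else
        #define NPY_FINLINE static inline
    #endif

With a macro like that, a declaration such as NPY_FINLINE __m512 avx512_masked_load_ps(__mmask16 mask, npy_float* addr) keeps internal linkage while strongly encouraging the compiler to inline each call, which is the point of these tiny helpers.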
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index d8c8fdc9e..51b167844 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -565,36 +565,36 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 #endif
 
 #ifdef AVX512F_NOMSVC
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }
 
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_full_load_mask_pd(void)
 {
     return 0xFF;
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
 {
     return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x01 << num_elem) - 0x01;
@@ -613,18 +613,18 @@ avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem
  * #INF = NPY_INFINITYF, NPY_INFINITY#
  * #NAN = NPY_NANF, NPY_NAN#
  */
-static @vtype@
+NPY_FINLINE @vtype@
 avx512_hadd_@vsub@(const @vtype@ x)
 {
     return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
 }
 
-static @vtype@
+NPY_FINLINE @vtype@
 avx512_hsub_@vsub@(const @vtype@ x)
 {
     return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
 }
 
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
 avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
 {
     // x1 = r1, i1
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 9970ad2ea..b17643d23 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -45,19 +45,19 @@
 
 #ifdef SIMD_AVX2_FMA3
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_full_load_mask_pd(void)
 {
     return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,7 +66,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_ps(addr);
 }
 
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
 {
     npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -74,7 +74,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_si256((__m256i*) addr);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_gather_ps(__m256 src,
                      npy_float* addr,
                      __m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_gather_pd(__m256d src,
                      npy_double* addr,
                      __m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
     return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_load_pd(__m256i mask, npy_double* addr)
 {
     return _mm256_maskload_pd(addr, mask);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
 {
     return _mm256_blendv_pd(x, val, mask);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_invert_mask_ps(__m256 ymask)
 {
     return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
 }
 
-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_invert_mask_pd(__m256i ymask)
 {
     return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_exponent(__m256 x)
 {
     /*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_mantissa(__m256 x)
 {
     /*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
                         _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
 }
 
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
 
 #ifdef SIMD_AVX512F
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }
 
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_full_load_mask_pd(void)
 {
     return 0xFF;
 }
 
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }
 
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x01 << num_elem) - 0x01;
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_gather_ps(__m512 src,
                         npy_float* addr,
                         __m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_gather_pd(__m512d src,
                         npy_double* addr,
                         __m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
     return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
 {
     return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
 {
     return _mm512_mask_blend_pd(mask, x, val);
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
     return _mm512_mask_mov_ps(x, ymask, y);
 }
 
-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_invert_mask_ps(__mmask16 ymask)
 {
     return _mm512_knot(ymask);
 }
 
-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_invert_mask_pd(__mmask8 ymask)
 {
     return _mm512_knot(ymask);
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_exponent(__m512 x)
 {
     return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_mantissa(__m512 x)
 {
     return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
 }
 
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_scalef_ps(__m512 poly, __m512 quadrant)
 {
     return _mm512_scalef_ps(poly, quadrant);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x4var_pd(__m512d t0,
                         __m512d t1,
                         __m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
     return _mm512_mask_blend_pd(lut_mask, res1, res2);
 }
 
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
                         __m512d t4, __m512d t5, __m512d t6, __m512d t7,
                         __m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
  *    3) x* = x - y*c3
  * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
  */
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
 simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
     @vtype@ reduced_x = @fmadd@(y, c1, x);
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b535599c6..654ab81cc 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -399,7 +399,7 @@ run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp con
  * # VOP = min, max#
  */
 
-static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
+NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
 {
     npy_float r;
     __m128 tmp = _mm_movehl_ps(v, v);    /* c     d     ... */
@@ -409,7 +409,7 @@ static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
     return r;
 }
 
-static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
+NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
 {
     npy_double r;
     __m128d tmp = _mm_unpackhi_pd(v, v);    /* b     b */
@@ -440,7 +440,7 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
  * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
  * calling convention leading to C2719 on 32 bit, see #4795
  */
-static NPY_INLINE void
+NPY_FINLINE void
 sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
                               npy_bool * op)
 {
@@ -557,7 +557,7 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
  */
 
 /* sets invalid fpu flag on QNaN for consistency with packed compare */
-static NPY_INLINE int
+NPY_FINLINE int
 sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
 {
     @vtype@ one = @vpre@_set1_@vsuf@(1);
@@ -733,19 +733,19 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
 fma_get_full_load_mask_pd(void)
 {
     return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -754,7 +754,7 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_ps(addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
 fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
 {
     npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
@@ -762,7 +762,7 @@ fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_si256((__m256i*) addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_masked_gather_ps(__m256 src,
                      npy_float* addr,
                      __m256i vindex,
@@ -771,7 +771,7 @@ fma_masked_gather_ps(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
 fma_masked_gather_pd(__m256d src,
                      npy_double* addr,
                      __m128i vindex,
@@ -780,43 +780,43 @@ fma_masked_gather_pd(__m256d src,
     return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
 fma_masked_load_pd(__m256i mask, npy_double* addr)
 {
     return _mm256_maskload_pd(addr, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
 fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
 {
     return _mm256_blendv_pd(x, val, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_invert_mask_ps(__m256 ymask)
 {
     return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
 fma_invert_mask_pd(__m256i ymask)
 {
     return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
@@ -826,37 +826,37 @@ fma_invert_mask_pd(__m256i ymask)
  * #vsub = ps, pd#
  * #vtype = __m256, __m256d#
  */
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_abs_@vsub@(@vtype@ x)
 {
     return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_reciprocal_@vsub@(@vtype@ x)
 {
     return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_rint_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_floor_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_ceil_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_trunc_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
@@ -865,31 +865,31 @@ fma_trunc_@vsub@(@vtype@ x)
 #endif
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
 avx512_get_full_load_mask_pd(void)
 {
     return 0xFF;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x01 << num_elem) - 0x01;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_masked_gather_ps(__m512 src,
                         npy_float* addr,
                         __m512i vindex,
@@ -898,7 +898,7 @@ avx512_masked_gather_ps(__m512 src,
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
 avx512_masked_gather_pd(__m512d src,
                         npy_double* addr,
                         __m256i vindex,
@@ -907,43 +907,43 @@ avx512_masked_gather_pd(__m512d src,
     return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
 avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
 {
     return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
 avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
 {
     return _mm512_mask_blend_pd(mask, x, val);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
     return _mm512_mask_mov_ps(x, ymask, y);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_invert_mask_ps(__mmask16 ymask)
 {
     return _mm512_knot(ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
 avx512_invert_mask_pd(__mmask8 ymask)
 {
     return _mm512_knot(ymask);
@@ -963,56 +963,56 @@ avx512_invert_mask_pd(__mmask8 ymask)
  * #INF = NPY_INFINITYF, NPY_INFINITY#
  * #NAN = NPY_NANF, NPY_NAN#
  */
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_abs_@vsub@(@vtype@ x)
 {
     return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
                                            _mm512_set1_@epi_vsub@ (@and_const@));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_reciprocal_@vsub@(@vtype@ x)
 {
     return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_rint_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x08);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_floor_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x09);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_ceil_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0A);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_trunc_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0B);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_hadd_@vsub@(const @vtype@ x)
 {
     return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_hsub_@vsub@(const @vtype@ x)
 {
     return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_cabsolute_@vsub@(const @vtype@ x1,
                         const @vtype@ x2,
                         const __m512i re_indices,
@@ -1057,7 +1057,7 @@ avx512_cabsolute_@vsub@(const @vtype@ x1,
     return _mm512_mul_@vsub@(hypot, larger);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_conjugate_@vsub@(const @vtype@ x)
 {
     /*
@@ -1070,7 +1070,7 @@ avx512_conjugate_@vsub@(const @vtype@ x)
     return _mm512_castsi512_@vsub@(res);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
 {
     // x1 = r1, i1
@@ -1083,7 +1083,7 @@ avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
     return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_csquare_@vsub@(@vtype@ x)
 {
     return avx512_cmul_@vsub@(x, x);
@@ -1106,25 +1106,25 @@ avx512_csquare_@vsub@(@vtype@ x)
 
 #if defined @CHK@
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_sqrt_ps(@vtype@ x)
 {
     return _mm@vsize@_sqrt_ps(x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 @isa@_sqrt_pd(@vtype@d x)
 {
     return _mm@vsize@_sqrt_pd(x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_square_ps(@vtype@ x)
 {
     return _mm@vsize@_mul_ps(x,x);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 @isa@_square_pd(@vtype@d x)
 {
     return _mm@vsize@_mul_pd(x,x);
@@ -1615,7 +1615,7 @@ AVX512F_absolute_@TYPE@(@type@ * op,
  *  you never know
  */
 #if !@and@
-static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
 {
     const @vtype@ zero = @vpre@_setzero_@vsuf@();
     const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
