| field | value | date |
|---|---|---|
| author | Charles Harris <charlesr.harris@gmail.com> | 2023-02-22 19:47:35 -0500 |
| committer | GitHub <noreply@github.com> | 2023-02-22 19:47:35 -0500 |
| commit | 3f142646cb3f368ff8e95330044fa71c096cb631 (patch) | |
| tree | 0593c52c17309d21ea7a592616c6157e8ff7ecee /numpy/core/src/common | |
| parent | 2448636fafdebfb327f5e029f333552e7107522a (diff) | |
| parent | 4cfe854f638e805ecac70e31f40aacb582d72b8d (diff) | |
| download | numpy-3f142646cb3f368ff8e95330044fa71c096cb631.tar.gz | |
Merge pull request #23153 from seiko2plus/removes_old_cpu_dispatcher
SIMD: Get rid of attribute-based CPU dispatching
Diffstat (limited to 'numpy/core/src/common')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | numpy/core/src/common/simd/avx2/memory.h | 16 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/memory.h | 53 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/arithmetic.h | 3 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/conversion.h | 8 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/math.h | 2 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/memory.h | 10 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/misc.h | 28 |
7 files changed, 66 insertions, 54 deletions
```diff
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 64692b54c..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -215,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
 }
 // fill zero to rest lanes
@@ -225,7 +225,7 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_maskload_epi64((const void*)ptr, mask);
+    return _mm256_maskload_epi64((const long long*)ptr, mask);
 }
 //// 64-bit nlane
@@ -240,7 +240,7 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
 }
 // fill zero to rest lanes
@@ -253,7 +253,7 @@ NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     assert(nlane > 0);
     npy_int64 m = -((npy_int64)(nlane > 1));
     __m256i mask = npyv_set_s64(-1, -1, m, m);
-    return _mm256_maskload_epi64((const void*)ptr, mask);
+    return _mm256_maskload_epi64((const long long*)ptr, mask);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
@@ -262,7 +262,7 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
     const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
     npy_int64 m = -((npy_int64)(nlane > 1));
     __m256i mask = npyv_set_s64(-1, -1, m, m);
-    __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
 }
 /*********************************
@@ -295,7 +295,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
@@ -315,7 +315,7 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 4);
+    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -361,7 +361,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    _mm256_maskstore_epi64((void*)ptr, mask, a);
+    _mm256_maskstore_epi64((long long*)ptr, mask, a);
 }
 //// 64-bit nlane
```
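The recurring change in `avx2/memory.h` swaps `(const void*)` for `(const long long*)`: GCC and Clang declare `_mm256_maskload_epi64` and friends as taking `long long const *`, so passing a `void*` leans on C's implicit pointer conversion, which C++ rejects. A minimal standalone sketch of the corrected pattern, using the hypothetical helper name `load_till_4xi64` (not from the patch):

```c
/* Load only the first `nlane` of 4 int64 lanes, zeroing the rest.
 * Compile with -mavx2. */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static __m256i load_till_4xi64(const int64_t *ptr, size_t nlane)
{
    const __m256i steps  = _mm256_setr_epi64x(0, 1, 2, 3);
    __m256i       vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (long long)nlane);
    /* lanes whose index is < nlane get an all-ones mask */
    __m256i       mask   = _mm256_cmpgt_epi64(vnlane, steps);
    /* the intrinsic is declared with `long long const *`, so cast
     * explicitly instead of going through `const void*` */
    return _mm256_maskload_epi64((const long long *)ptr, mask);
}
```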
```diff
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index b97f3ec2e..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -244,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
         return _mm_cvtsi32_si128(*ptr);
     case 2:
         return _mm_loadl_epi64((const __m128i*)ptr);
-    case 3:;
-        npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
-        #ifdef NPY_HAVE_SSE41
-            return _mm_insert_epi32(a, ptr[2], 2);
-        #else
-            return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
-        #endif
+    case 3: {
+        npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+        #ifdef NPY_HAVE_SSE41
+            return _mm_insert_epi32(a, ptr[2], 2);
+        #else
+            return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+        #endif
+    }
     default:
         return npyv_load_s32(ptr);
     }
@@ -371,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
     case 1:
         return _mm_cvtsi32_si128(ptr[0]);
     case 2:;
-        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
-        return _mm_insert_epi32(a, ptr[stride], 1);
-#else
-        return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
-    case 3:;
-        a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
-        a = _mm_insert_epi32(a, ptr[stride], 1);
-        a = _mm_insert_epi32(a, ptr[stride*2], 2);
-        return a;
-#else
-        a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-        a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
-        return a;
-#endif // NPY_HAVE_SSE41
+    {
+        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+    #ifdef NPY_HAVE_SSE41
+        return _mm_insert_epi32(a, ptr[stride], 1);
+    #else
+        return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+    #endif // NPY_HAVE_SSE41
+    }
+    case 3:
+    {
+        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+    #ifdef NPY_HAVE_SSE41
+        a = _mm_insert_epi32(a, ptr[stride], 1);
+        a = _mm_insert_epi32(a, ptr[stride*2], 2);
+        return a;
+    #else
+        a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+        a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+        return a;
+    #endif // NPY_HAVE_SSE41
+    }
     default:
         return npyv_loadn_s32(ptr, stride);
     }
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index 3c13ab06c..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -366,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
     npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
     return vec_extract(sum, 0) + vec_extract(sum, 1);
+    (void)sum;
 }
 #endif
@@ -386,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
     npyv_u32 four = vec_sum4s(a, zero);
     npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
     return (npy_uint16)vec_extract(one, 3);
+    (void)one;
 #endif
 }
@@ -400,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
     npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
     npyv_s32 one = vec_sums((npyv_s32)four, zero);
     return (npy_uint32)vec_extract(one, 3);
+    (void)one;
 #endif
 }
```
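The `sse/memory.h` hunks replace the old `case 3:;` trick with braced case bodies. In C, a declaration cannot directly follow a label (a label must prefix a statement), and declaring `a` once and reusing it across cases let one case's variable bleed into another; opening a block scope fixes both. A minimal sketch of the rule, with a hypothetical `classify` function (not from the patch):

```c
/* Braced case bodies give each `a` its own scope. */
int classify(int n)
{
    switch (n) {
    case 2: {              /* braces open a scope for this `a` */
        int a = n + 1;
        return a;
    }
    case 3: {              /* a second, independent `a`; directly after
                              the label, without braces, the declaration
                              would not even parse in C */
        int a = n * 2;
        return a;
    }
    default:
        return 0;
    }
}
```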
```diff
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index 0c0d5b3ac..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #else
     return vec_extract(r, 4);
 #endif
+    // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
 {
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #else
     return vec_extract(r, 8);
 #endif
+    // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
 {
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #else
     return vec_extract(r, 8);
 #endif
+    // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
 {
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #else
     return vec_extract(r, 8);
 #endif
+    // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 #else
 NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
     {                                                     \
         npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
         return (npy_##STYPE)vec_extract(r, 0);            \
+        (void)r;                                          \
     }
 NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
 NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
     {                                                          \
         npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8));        \
         return vec_extract(r, 0);                              \
+        (void)r;                                               \
     }                                                          \
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
     {                                                          \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index 3c17617b1..de78d02e3 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
 #endif

 // avoid aliasing rules
-#ifdef __cplusplus
-    template<typename T_PTR>
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }

 // load lower part
 NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
```
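The `(void)r;` and `(void)one;` lines added across `vec/arithmetic.h`, `vec/conversion.h`, and `vec/math.h` are the classic cast-to-void idiom: when only one preprocessor branch reads a variable, the other branch trips `-Wunused-but-set-variable`, and a void cast marks the variable as used without emitting any code, even when it sits after a `return` and is never executed. A minimal sketch, with `FAST_PATH` as a hypothetical stand-in for macros like `NPY_HAVE_SSE41`:

```c
/* Cast-to-void idiom; FAST_PATH is hypothetical. */
unsigned pick(void)
{
    unsigned r = 42;   /* computed on every path */
#ifdef FAST_PATH
    return r;          /* this branch reads r */
#else
    return 0;          /* this branch does not, so GCC would warn about r */
#endif
    (void)r;           /* unreachable at run time; it only needs to
                          reference r at compile time to silence
                          -Wunused-but-set-variable */
}
```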
```diff
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
 #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
 #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})

-#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)VAL)
-#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)(VAL))
+#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
 #if NPY_SIMD_F32
-    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
 #endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
 #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)

 // vector with specific values set to each lane and
 // set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...)  ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...)  ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...)  ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...)  ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
 #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
 #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
 #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
 #if NPY_SIMD_F32
     #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
```
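The `vec/misc.h` hunk wraps each `npyv_setall_*` macro argument in parentheses (and fixes the element types passed to `NPYV__SET_FILL_*`). Without the parentheses, a cast applies only to the first operand of an expression argument, since casts bind tighter than binary operators. A minimal sketch with a hypothetical `SPLAT` macro pair (not from the patch):

```c
#include <stdio.h>

/* Hypothetical macros showing why (VAL) matters in the patch above. */
#define SPLAT_BAD(VAL)  ((unsigned char)VAL)    /* cast hits first token only */
#define SPLAT_GOOD(VAL) ((unsigned char)(VAL))  /* cast hits whole expression */

int main(void)
{
    /* (unsigned char)1 + 255 == 256, but (unsigned char)(1 + 255) == 0 */
    printf("%d\n", SPLAT_BAD(1 + 255));   /* prints 256 */
    printf("%d\n", SPLAT_GOOD(1 + 255));  /* prints 0 */
    return 0;
}
```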
