summary refs log tree commit diff
path: root/numpy/core/src/common
diff options
context:
space:
mode:
author Charles Harris <charlesr.harris@gmail.com> 2023-02-22 19:47:35 -0500
committer GitHub <noreply@github.com> 2023-02-22 19:47:35 -0500
commit 3f142646cb3f368ff8e95330044fa71c096cb631 (patch)
tree 0593c52c17309d21ea7a592616c6157e8ff7ecee /numpy/core/src/common
parent 2448636fafdebfb327f5e029f333552e7107522a (diff)
parent 4cfe854f638e805ecac70e31f40aacb582d72b8d (diff)
download numpy-3f142646cb3f368ff8e95330044fa71c096cb631.tar.gz
Merge pull request #23153 from seiko2plus/removes_old_cpu_dispatcher
SIMD: Get rid of attribute-based CPU dispatching
Diffstat (limited to 'numpy/core/src/common')
-rw-r--r-- numpy/core/src/common/simd/avx2/memory.h 16
-rw-r--r-- numpy/core/src/common/simd/sse/memory.h 53
-rw-r--r-- numpy/core/src/common/simd/vec/arithmetic.h 3
-rw-r--r-- numpy/core/src/common/simd/vec/conversion.h 8
-rw-r--r-- numpy/core/src/common/simd/vec/math.h 2
-rw-r--r-- numpy/core/src/common/simd/vec/memory.h 10
-rw-r--r-- numpy/core/src/common/simd/vec/misc.h 28
7 files changed, 66 insertions, 54 deletions
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 64692b54c..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -215,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -225,7 +225,7 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
}
//// 64-bit nlane
@@ -240,7 +240,7 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -253,7 +253,7 @@ NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
assert(nlane > 0);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
}
// fill the rest lanes with the given values (fill_lo, fill_hi)
NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
@@ -262,7 +262,7 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
/*********************************
@@ -295,7 +295,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -315,7 +315,7 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 4);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -361,7 +361,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- _mm256_maskstore_epi64((void*)ptr, mask, a);
+ _mm256_maskstore_epi64((long long*)ptr, mask, a);
}
//// 64-bit nlane
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index b97f3ec2e..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -244,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
return _mm_cvtsi32_si128(*ptr);
case 2:
return _mm_loadl_epi64((const __m128i*)ptr);
- case 3:;
- npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
- #ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[2], 2);
- #else
- return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
- #endif
+ case 3: {
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ }
default:
return npyv_load_s32(ptr);
}
@@ -371,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
case 1:
return _mm_cvtsi32_si128(ptr[0]);
case 2:;
- npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[stride], 1);
-#else
- return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
- case 3:;
- a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- a = _mm_insert_epi32(a, ptr[stride], 1);
- a = _mm_insert_epi32(a, ptr[stride*2], 2);
- return a;
-#else
- a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
- a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
- return a;
-#endif // NPY_HAVE_SSE41
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+ #else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ #endif // NPY_HAVE_SSE41
+ }
+ case 3:
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+ #else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+ #endif // NPY_HAVE_SSE41
+ }
default:
return npyv_loadn_s32(ptr, stride);
}
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index 3c13ab06c..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -366,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
+ (void)sum;
}
#endif
@@ -386,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
npyv_u32 four = vec_sum4s(a, zero);
npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
return (npy_uint16)vec_extract(one, 3);
+ (void)one;
#endif
}
@@ -400,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
npyv_s32 one = vec_sums((npyv_s32)four, zero);
return (npy_uint32)vec_extract(one, 3);
+ (void)one;
#endif
}
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index 0c0d5b3ac..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 4);
#endif
+ // to suppress ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
#else
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
{ \
npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return (npy_##STYPE)vec_extract(r, 0); \
+ (void)r; \
}
NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
{ \
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
+ (void)r; \
} \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index 3c17617b1..de78d02e3 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
#endif
// avoid aliasing rules
-#ifdef __cplusplus
- template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
// load lower part
NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
-#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL)
-#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL))
+#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
#if NPY_SIMD_F32
- #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+ #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
#endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
// vector with specific values set to each lane and
// set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
#if NPY_SIMD_F32
#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})