diff options
| author | Charles Harris <charlesr.harris@gmail.com> | 2023-02-22 19:47:35 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-02-22 19:47:35 -0500 |
| commit | 3f142646cb3f368ff8e95330044fa71c096cb631 (patch) | |
| tree | 0593c52c17309d21ea7a592616c6157e8ff7ecee /numpy/core/src | |
| parent | 2448636fafdebfb327f5e029f333552e7107522a (diff) | |
| parent | 4cfe854f638e805ecac70e31f40aacb582d72b8d (diff) | |
| download | numpy-3f142646cb3f368ff8e95330044fa71c096cb631.tar.gz | |
Merge pull request #23153 from seiko2plus/removes_old_cpu_dispatcher
SIMD: Get rid of attribute-based CPU dispatching
Diffstat (limited to 'numpy/core/src')
| -rw-r--r-- | numpy/core/src/common/simd/avx2/memory.h | 16 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/memory.h | 53 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/arithmetic.h | 3 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/conversion.h | 8 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/math.h | 2 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/memory.h | 10 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/misc.h | 28 | ||||
| -rw-r--r-- | numpy/core/src/npysort/mergesort.cpp | 1 | ||||
| -rw-r--r-- | numpy/core/src/umath/fast_loop_macros.h | 17 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 241 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 127 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops_autovec.dispatch.c.src | 287 | ||||
| -rw-r--r-- | numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 2 |
13 files changed, 436 insertions, 359 deletions
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 64692b54c..81144a36b 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -215,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } // fill zero to rest lanes @@ -225,7 +225,7 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_maskload_epi64((const long long*)ptr, mask); } //// 64-bit nlane @@ -240,7 +240,7 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } // fill zero to rest lanes @@ -253,7 +253,7 @@ NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) assert(nlane > 0); npy_int64 m = -((npy_int64)(nlane > 1)); __m256i mask = npyv_set_s64(-1, -1, m, m); - return _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_maskload_epi64((const long long*)ptr, mask); } // fill zero to rest lanes NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, @@ -262,7 +262,7 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi); npy_int64 m = -((npy_int64)(nlane > 1)); __m256i mask = npyv_set_s64(-1, -1, m, m); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } /********************************* @@ -295,7 +295,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 @@ -315,7 +315,7 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 4); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) @@ -361,7 +361,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - _mm256_maskstore_epi64((void*)ptr, mask, a); + _mm256_maskstore_epi64((long long*)ptr, mask, a); } //// 64-bit nlane diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index b97f3ec2e..4c8e86a6f 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -244,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) return _mm_cvtsi32_si128(*ptr); case 2: return _mm_loadl_epi64((const __m128i*)ptr); - case 3:; - npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); - #ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[2], 2); - #else - return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); - #endif + case 3: { + npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[2], 2); + #else + return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); + #endif + } default: return npyv_load_s32(ptr); } @@ -371,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) case 1: return _mm_cvtsi32_si128(ptr[0]); case 2:; - npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[stride], 1); -#else - return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); -#endif // NPY_HAVE_SSE41 - case 3:; - a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - a = _mm_insert_epi32(a, ptr[stride], 1); - a = _mm_insert_epi32(a, ptr[stride*2], 2); - return a; -#else - a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); - a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); - return a; -#endif // NPY_HAVE_SSE41 + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[stride], 1); + #else + return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + #endif // NPY_HAVE_SSE41 + } + case 3: + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + return a; + #else + a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); + return a; + #endif // NPY_HAVE_SSE41 + } default: return npyv_loadn_s32(ptr, stride); } diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h index 3c13ab06c..85f4d6b26 100644 --- a/numpy/core/src/common/simd/vec/arithmetic.h +++ b/numpy/core/src/common/simd/vec/arithmetic.h @@ -366,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); return vec_extract(sum, 0) + vec_extract(sum, 1); + (void)sum; } #endif @@ -386,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) npyv_u32 four = vec_sum4s(a, zero); npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero); return (npy_uint16)vec_extract(one, 3); + (void)one; #endif } @@ -400,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) npyv_u32 four = vec_add(eight.val[0], eight.val[1]); npyv_s32 one = vec_sums((npyv_s32)four, zero); return (npy_uint32)vec_extract(one, 3); + (void)one; #endif } diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h index 0c0d5b3ac..922109f7b 100644 --- a/numpy/core/src/common/simd/vec/conversion.h +++ b/numpy/core/src/common/simd/vec/conversion.h @@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 4); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) { @@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) { @@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) { @@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } #else NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h index 95b16fdf7..85690f76c 100644 --- a/numpy/core/src/common/simd/vec/math.h +++ b/numpy/core/src/common/simd/vec/math.h @@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32) { \ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return (npy_##STYPE)vec_extract(r, 0); \ + (void)r; \ } NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64) NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64) @@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64) { \ npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return vec_extract(r, 0); \ + (void)r; \ } \ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ { \ diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h index 3c17617b1..de78d02e3 100644 --- a/numpy/core/src/common/simd/vec/memory.h +++ b/numpy/core/src/common/simd/vec/memory.h @@ -46,14 +46,8 @@ #endif // avoid aliasing rules -#ifdef __cplusplus - template<typename T_PTR> - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#else - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#endif // __cplusplus +NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) +{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } // load lower part NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h index 7ea0f21f6..79c194d90 100644 --- a/numpy/core/src/common/simd/vec/misc.h +++ b/numpy/core/src/common/simd/vec/misc.h @@ -26,28 +26,28 @@ #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V}) #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V}) -#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL) -#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL) -#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL) -#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL) -#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL) -#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL) +#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL)) +#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL)) +#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL)) +#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL)) +#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL)) +#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL)) #if NPY_SIMD_F32 - #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL) + #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL)) #endif -#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL) -#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL) +#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL)) +#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL)) #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL) // vector with specific values set to each lane and // set a specific value to all remained lanes -#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) +#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)}) +#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)}) +#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)}) #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) -#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)}) #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) -#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)}) #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) #if NPY_SIMD_F32 #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) diff --git a/numpy/core/src/npysort/mergesort.cpp b/numpy/core/src/npysort/mergesort.cpp index 60d89ddb7..f53feb320 100644 --- a/numpy/core/src/npysort/mergesort.cpp +++ b/numpy/core/src/npysort/mergesort.cpp @@ -171,6 +171,7 @@ amergesort_(type *v, npy_intp *tosort, npy_intp num) } /* + ***************************************************************************** ** STRING SORTS ** ***************************************************************************** diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index cade26db4..b8c1926b2 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -12,6 +12,19 @@ #include <assert.h> +#include "simd/simd.h" + +/* + * largest simd vector size in bytes numpy supports + * it is currently a extremely large value as it is only used for memory + * overlap checks + */ +#if NPY_SIMD > 0 + // Enough for compiler unroll + #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4 +#else + #define AUTOVEC_OVERLAP_SIZE 1024 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The @@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b) /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ if (abs_ptrdiff(args[2], args[0]) == 0 && \ - abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else if (abs_ptrdiff(args[2], args[1]) == 0 && \ - abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else { \ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5684f26a5..397ebaca2 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -32,16 +32,6 @@ */ #define PW_BLOCKSIZE 128 - -/* - * largest simd vector size in bytes numpy supports - * it is currently a extremely large value as it is only used for memory - * overlap checks - */ -#ifndef NPY_MAX_SIMD_SIZE -#define NPY_MAX_SIMD_SIZE 1024 -#endif - /** Provides the various *_LOOP macros */ #include "fast_loop_macros.h" @@ -417,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, } } - -/**begin repeat - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@); -} - -/**end repeat**/ - /* ***************************************************************************** ** INTEGER LOOPS @@ -466,82 +438,16 @@ NPY_NO_EXPORT void *((@type@ *)op1) = 1; } } - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = +in); -} - /**begin repeat1 - * #isa = , _avx2# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)# - * #ATTR = , NPY_GCC_TARGET_AVX2# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in * in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = ~in); -} -#endif - -/**begin repeat2 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor# * #OP = +, -, *, &, |, ^# */ -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, - npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); - } - else { - BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); - } -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int -@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), - char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) +NPY_NO_EXPORT NPY_GCC_OPT_3 int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) { char *ip1 = args[0]; char *indx = args[1]; @@ -556,86 +462,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int } return 0; } - -#endif - -/**end repeat2**/ - -/* - * Arithmetic bit shift operations. - * - * Intel hardware masks bit shift values, so large shifts wrap around - * and can produce surprising results. The special handling ensures that - * behavior is independent of compiler or hardware. - * TODO: We could implement consistent behavior for negative shifts, - * which is undefined in C. - */ - -#define INT_left_shift_needs_clear_floatstatus -#define UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); - -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -#endif - -#undef INT_left_shift_needs_clear_floatstatus -#undef UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift -NPY_GCC_OPT_3 -#endif -void -@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -} -#endif - -/**begin repeat2 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * gcc vectorization of this is not good (PR60575) but manual integer - * vectorization is too tedious to be worthwhile - */ - BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); -} -#endif - -/**end repeat2**/ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const int t1 = !!*(@type@ *)ip1; - const int t2 = !!*(@type@ *)ip2; - *((npy_bool *)op1) = (t1 != t2); - } -} -#endif - /**end repeat1**/ NPY_NO_EXPORT void @@ -677,23 +503,6 @@ NPY_NO_EXPORT void *((@type@ *) op1) = out; } } - -/**begin repeat1 - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); -} -/**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -701,19 +510,6 @@ NPY_NO_EXPORT void * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #c = ,,,l,ll# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0)); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -727,7 +523,6 @@ NPY_NO_EXPORT void } } /**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -735,19 +530,6 @@ NPY_NO_EXPORT void * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# * #c = u,u,u,ul,ull# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -761,7 +543,6 @@ NPY_NO_EXPORT void } } /**end repeat1**/ - /**end repeat**/ /* @@ -840,12 +621,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE); -} - -NPY_NO_EXPORT void @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { @@ -1714,7 +1489,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context), const float v = npy_half_to_float(*(npy_half *)value); *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v); } - return 0; + return 0; } /**end repeat**/ @@ -1974,12 +1749,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v } } -NPY_NO_EXPORT NPY_GCC_OPT_3 void -HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); -} - NPY_NO_EXPORT void HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index e393b8310..e36067822 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -68,6 +68,17 @@ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #kind = isnan, isinf, isfinite# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /* ***************************************************************************** ** INTEGER LOOPS @@ -123,16 +134,34 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif /**begin repeat - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = invert, logical_not, conjugate, reciprocal, square, add, + * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift, logical_and, logical_or, + * logical_xor, isnan, isinf, isfinite, + * absolute, sign# */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ /**begin repeat1 * both signed and unsigned integer types * #s = , u# * #S = , U# */ - #define @S@@TYPE@_floor_divide @S@@TYPE@_divide #define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed #define @S@@TYPE@_fmax @S@@TYPE@_maximum @@ -147,49 +176,15 @@ NPY_NO_EXPORT void @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - NPY_NO_EXPORT int -@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); - -/**end repeat3**/ - -/**begin repeat3 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, + npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**begin repeat2 @@ -207,26 +202,13 @@ NPY_NO_EXPORT void @S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**end repeat1**/ - /**end repeat**/ @@ -375,18 +357,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat1**/ /**end repeat**/ -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = maximum, minimum# - */ -NPY_NO_EXPORT void -@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat1**/ -/**end repeat**/ - #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_trigonometric.dispatch.h" #endif @@ -562,6 +532,19 @@ NPY_NO_EXPORT void /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #TYPE = HALF# + */ +/**begin repeat1 + * #kind = absolute# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ /* ***************************************************************************** ** COMPLEX LOOPS ** @@ -728,9 +711,6 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - #define @TYPE@_isnan @TYPE@_isnat NPY_NO_EXPORT void @@ -806,6 +786,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #TYPE = TIMEDELTA, DATETIME# + */ +/**begin repeat1 + * #kind = isinf# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + /* ***************************************************************************** ** OBJECT LOOPS ** diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src new file mode 100644 index 000000000..bdbfa0f86 --- /dev/null +++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src @@ -0,0 +1,287 @@ +/*@targets + ** $maxopt $autovec baseline + ** sse2 avx2 + ** neon + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ +/* + * Arithmetic bit shift operations. + * + * Intel hardware masks bit shift values, so large shifts wrap around + * and can produce surprising results. The special handling ensures that + * behavior is independent of compiler or hardware. + * TODO: We could implement consistent behavior for negative shifts, + * which is undefined in C. + */ +#define INT_left_shift_needs_clear_floatstatus +#define UINT_left_shift_needs_clear_floatstatus + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double, + * npy_double, npy_double, npy_double, npy_double# + * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# + * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = +in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in * in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); +} + +/**begin repeat1 + * Arithmetic + * #kind = add, subtract, multiply# + * #OP = +, -, *# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +/**end repeat**/ + +/* + ***************************************************************************** + ** UNSIGNED INTEGER LOOPS + ***************************************************************************** + */ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #c = u,u,u,ul,ull# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); +} + +/**begin repeat1 + * Arithmetic + * #kind = bitwise_and, bitwise_or, bitwise_xor# + * #OP = &, |, ^# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = logical_and, logical_or# + * #OP = &&, ||# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * gcc vectorization of this is not good (PR60575) but manual integer + * vectorization is too tedious to be worthwhile + */ + BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); +} +/**end repeat1**/ + +NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2) +{ return (!!in1) != (!!in2); } + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2)); +} + +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * The (void)in; suppresses an unused variable warning raised by gcc and allows + * us to re-use this macro even though we do not depend on in + */ + UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = ~in); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** SIGNED! INTEGER LOOPS + ***************************************************************************** + */ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #c = ,,,l,ll# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0)); +} + +/**begin repeat1 + * #kind = conjugate, invert, isnan, isinf, isfinite, + * logical_and, logical_or, logical_xor, logical_not, + * bitwise_and, bitwise_or, bitwise_xor# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func); +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ +/**begin repeat + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** HALF-FLOAT LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); +} + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #type = npy_datetime, npy_timedelta# + * #TYPE = DATETIME, TIMEDELTA# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func); +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index 182c57b01..1fac3c150 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant) #ifdef SIMD_AVX512F -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +NPY_FINLINE __mmask16 avx512_get_full_load_mask_ps(void) { return 0xFFFF; |
