summaryrefslogtreecommitdiff
path: root/numpy/core/src
diff options
context:
space:
mode:
authorCharles Harris <charlesr.harris@gmail.com>2023-02-22 19:47:35 -0500
committerGitHub <noreply@github.com>2023-02-22 19:47:35 -0500
commit3f142646cb3f368ff8e95330044fa71c096cb631 (patch)
tree0593c52c17309d21ea7a592616c6157e8ff7ecee /numpy/core/src
parent2448636fafdebfb327f5e029f333552e7107522a (diff)
parent4cfe854f638e805ecac70e31f40aacb582d72b8d (diff)
downloadnumpy-3f142646cb3f368ff8e95330044fa71c096cb631.tar.gz
Merge pull request #23153 from seiko2plus/removes_old_cpu_dispatcher
SIMD: Get rid of attribute-based CPU dispatching
Diffstat (limited to 'numpy/core/src')
-rw-r--r--numpy/core/src/common/simd/avx2/memory.h16
-rw-r--r--numpy/core/src/common/simd/sse/memory.h53
-rw-r--r--numpy/core/src/common/simd/vec/arithmetic.h3
-rw-r--r--numpy/core/src/common/simd/vec/conversion.h8
-rw-r--r--numpy/core/src/common/simd/vec/math.h2
-rw-r--r--numpy/core/src/common/simd/vec/memory.h10
-rw-r--r--numpy/core/src/common/simd/vec/misc.h28
-rw-r--r--numpy/core/src/npysort/mergesort.cpp1
-rw-r--r--numpy/core/src/umath/fast_loop_macros.h17
-rw-r--r--numpy/core/src/umath/loops.c.src241
-rw-r--r--numpy/core/src/umath/loops.h.src127
-rw-r--r--numpy/core/src/umath/loops_autovec.dispatch.c.src287
-rw-r--r--numpy/core/src/umath/loops_exponent_log.dispatch.c.src2
13 files changed, 436 insertions, 359 deletions
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 64692b54c..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -215,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -225,7 +225,7 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
}
//// 64-bit nlane
@@ -240,7 +240,7 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -253,7 +253,7 @@ NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
assert(nlane > 0);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
}
// fill zero to rest lanes
NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
@@ -262,7 +262,7 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
/*********************************
@@ -295,7 +295,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -315,7 +315,7 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 4);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -361,7 +361,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- _mm256_maskstore_epi64((void*)ptr, mask, a);
+ _mm256_maskstore_epi64((long long*)ptr, mask, a);
}
//// 64-bit nlane
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index b97f3ec2e..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -244,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
return _mm_cvtsi32_si128(*ptr);
case 2:
return _mm_loadl_epi64((const __m128i*)ptr);
- case 3:;
- npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
- #ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[2], 2);
- #else
- return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
- #endif
+ case 3: {
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ }
default:
return npyv_load_s32(ptr);
}
@@ -371,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
case 1:
return _mm_cvtsi32_si128(ptr[0]);
case 2:;
- npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[stride], 1);
-#else
- return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
- case 3:;
- a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- a = _mm_insert_epi32(a, ptr[stride], 1);
- a = _mm_insert_epi32(a, ptr[stride*2], 2);
- return a;
-#else
- a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
- a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
- return a;
-#endif // NPY_HAVE_SSE41
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+ #else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ #endif // NPY_HAVE_SSE41
+ }
+ case 3:
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+ #else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+ #endif // NPY_HAVE_SSE41
+ }
default:
return npyv_loadn_s32(ptr, stride);
}
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index 3c13ab06c..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -366,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
+ (void)sum;
}
#endif
@@ -386,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
npyv_u32 four = vec_sum4s(a, zero);
npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
return (npy_uint16)vec_extract(one, 3);
+ (void)one;
#endif
}
@@ -400,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
npyv_s32 one = vec_sums((npyv_s32)four, zero);
return (npy_uint32)vec_extract(one, 3);
+ (void)one;
#endif
}
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index 0c0d5b3ac..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 4);
#endif
+ // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+ (void)r;
}
#else
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
{ \
npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return (npy_##STYPE)vec_extract(r, 0); \
+ (void)r; \
}
NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
{ \
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
+ (void)r; \
} \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index 3c17617b1..de78d02e3 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
#endif
// avoid aliasing rules
-#ifdef __cplusplus
- template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
// load lower part
NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
-#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL)
-#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL))
+#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
#if NPY_SIMD_F32
- #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+ #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
#endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
// vector with specific values set to each lane and
// set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
#if NPY_SIMD_F32
#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
diff --git a/numpy/core/src/npysort/mergesort.cpp b/numpy/core/src/npysort/mergesort.cpp
index 60d89ddb7..f53feb320 100644
--- a/numpy/core/src/npysort/mergesort.cpp
+++ b/numpy/core/src/npysort/mergesort.cpp
@@ -171,6 +171,7 @@ amergesort_(type *v, npy_intp *tosort, npy_intp num)
}
/*
+
*****************************************************************************
** STRING SORTS **
*****************************************************************************
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index cade26db4..b8c1926b2 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -12,6 +12,19 @@
#include <assert.h>
+#include "simd/simd.h"
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently a extremely large value as it is only used for memory
+ * overlap checks
+ */
+#if NPY_SIMD > 0
+ // Enough for compiler unroll
+ #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4
+#else
+ #define AUTOVEC_OVERLAP_SIZE 1024
+#endif
/*
* MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
* Very large step size can be as slow as processing it using scalar. The
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
- abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
- abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5684f26a5..397ebaca2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -32,16 +32,6 @@
*/
#define PW_BLOCKSIZE 128
-
-/*
- * largest simd vector size in bytes numpy supports
- * it is currently a extremely large value as it is only used for memory
- * overlap checks
- */
-#ifndef NPY_MAX_SIMD_SIZE
-#define NPY_MAX_SIMD_SIZE 1024
-#endif
-
/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"
@@ -417,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps,
}
}
-
-/**begin repeat
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
-}
-
-/**end repeat**/
-
/*
*****************************************************************************
** INTEGER LOOPS
@@ -466,82 +438,16 @@ NPY_NO_EXPORT void
*((@type@ *)op1) = 1;
}
}
-
-NPY_NO_EXPORT void
-@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = +in);
-}
-
/**begin repeat1
- * #isa = , _avx2#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
* #OP = +, -, *, &, |, ^#
*/
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions,
- npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (IS_BINARY_REDUCE) {
- BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
- }
- else {
- BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
-@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
- char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
{
char *ip1 = args[0];
char *indx = args[1];
@@ -556,86 +462,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
}
return 0;
}
-
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- * which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
- // For some reason, our macOS CI sets an "invalid" flag here, but only
- // for some types.
- npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-#endif
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-#endif
-
-/**begin repeat2
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * gcc vectorization of this is not good (PR60575) but manual integer
- * vectorization is too tedious to be worthwhile
- */
- BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- BINARY_LOOP {
- const int t1 = !!*(@type@ *)ip1;
- const int t2 = !!*(@type@ *)ip2;
- *((npy_bool *)op1) = (t1 != t2);
- }
-}
-#endif
-
/**end repeat1**/
NPY_NO_EXPORT void
@@ -677,23 +503,6 @@ NPY_NO_EXPORT void
*((@type@ *) op1) = out;
}
}
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
-}
-/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -701,19 +510,6 @@ NPY_NO_EXPORT void
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
* #c = ,,,l,ll#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -727,7 +523,6 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -735,19 +530,6 @@ NPY_NO_EXPORT void
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #c = u,u,u,ul,ull#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -761,7 +543,6 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
-
/**end repeat**/
/*
@@ -840,12 +621,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE);
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -1714,7 +1489,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context),
const float v = npy_half_to_float(*(npy_half *)value);
*indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
}
- return 0;
+ return 0;
}
/**end repeat**/
@@ -1974,12 +1749,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v
}
}
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
-}
-
NPY_NO_EXPORT void
HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e393b8310..e36067822 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -68,6 +68,17 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
/*
*****************************************************************************
** INTEGER LOOPS
@@ -123,16 +134,34 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
/**begin repeat
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+/**begin repeat1
+ * #kind = invert, logical_not, conjugate, reciprocal, square, add,
+ * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift, logical_and, logical_or,
+ * logical_xor, isnan, isinf, isfinite,
+ * absolute, sign#
*/
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
/**begin repeat1
* both signed and unsigned integer types
* #s = , u#
* #S = , U#
*/
-
#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
#define @S@@TYPE@_fmax @S@@TYPE@_maximum
@@ -147,49 +176,15 @@ NPY_NO_EXPORT void
@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
* left_shift, right_shift#
* #OP = +, -,*, &, |, ^, <<, >>#
*/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
NPY_NO_EXPORT int
-@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+ npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**begin repeat2
@@ -207,26 +202,13 @@ NPY_NO_EXPORT void
@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat2
- * #kind = isnan, isinf, isfinite#
- **/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**end repeat1**/
-
/**end repeat**/
@@ -375,18 +357,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
/**end repeat1**/
/**end repeat**/
-/**begin repeat
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat1**/
-/**end repeat**/
-
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_trigonometric.dispatch.h"
#endif
@@ -562,6 +532,19 @@ NPY_NO_EXPORT void
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = HALF#
+ */
+/**begin repeat1
+ * #kind = absolute#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
/*
*****************************************************************************
** COMPLEX LOOPS **
@@ -728,9 +711,6 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
#define @TYPE@_isnan @TYPE@_isnat
NPY_NO_EXPORT void
@@ -806,6 +786,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = TIMEDELTA, DATETIME#
+ */
+/**begin repeat1
+ * #kind = isinf#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
/*
*****************************************************************************
** OBJECT LOOPS **
diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src
new file mode 100644
index 000000000..bdbfa0f86
--- /dev/null
+++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src
@@ -0,0 +1,287 @@
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2
+ ** neon
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ ** INTEGER LOOPS
+ *****************************************************************************
+ */
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ * which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ * npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = +in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply#
+ * #OP = +, -, *#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+ // For some reason, our macOS CI sets an "invalid" flag here, but only
+ // for some types.
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ @type@ in2 = *(@type@ *)ip2;
+ *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+ }
+#endif
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** UNSIGNED INTEGER LOOPS
+ *****************************************************************************
+ */
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #c = u,u,u,ul,ull#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = &, |, ^#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * gcc vectorization of this is not good (PR60575) but manual integer
+ * vectorization is too tedious to be worthwhile
+ */
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2)
+{ return (!!in1) != (!!in2); }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2));
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * The (void)in; suppresses an unused variable warning raised by gcc and allows
+ * us to re-use this macro even though we do not depend on in
+ */
+ UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** SIGNED! INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #c = ,,,l,ll#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+/**begin repeat1
+ * #kind = conjugate, invert, isnan, isinf, isfinite,
+ * logical_and, logical_or, logical_xor, logical_not,
+ * bitwise_and, bitwise_or, bitwise_xor#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** BOOLEAN LOOPS **
+ *****************************************************************************
+ */
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** HALF-FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
+}
+
+/*
+ *****************************************************************************
+ ** DATETIME LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #type = npy_datetime, npy_timedelta#
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 182c57b01..1fac3c150 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
#ifdef SIMD_AVX512F
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;