author    Sayed Adel <seiko@imavr.com>  2022-09-17 21:43:06 +0200
committer Sayed Adel <seiko@imavr.com>  2022-09-19 08:27:21 +0200
commit    6ef4c8bc1459f5d4f548ed87715651c6bc75fc49 (patch)
tree      2d33c4f264ddb7dd3b7416b75baf7b8906be96ff /numpy/core/src/common
parent    a2697cac9adbeb2e9218543eed41c185182faf2a (diff)
download  numpy-6ef4c8bc1459f5d4f548ed87715651c6bc75fc49.tar.gz
SIMD: Add new intrinsics to test truth across all vector lanes

npyv_any_##SFX: returns true if any element is not equal to zero
npyv_all_##SFX: returns true if all elements are not equal to zero
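
As a usage sketch (not part of the commit), the new predicates let kernels take early exits without target-specific movemask code. This assumes a translation unit where the universal intrinsics are enabled (NPY_SIMD != 0) and, for brevity, a length that is a multiple of the lane count; all_positive_f32 is a hypothetical helper:

    #include "simd/simd.h"

    static int all_positive_f32(const float *src, npy_intp len)
    {
        for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
            npyv_f32 v  = npyv_load_f32(src + i);
            npyv_b32 gt = npyv_cmpgt_f32(v, npyv_zero_f32());
            if (!npyv_all_b32(gt)) {
                return 0;  // some lane is <= 0 (or NaN: the compare yields false)
            }
        }
        return 1;
    }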
Diffstat (limited to 'numpy/core/src/common')
-rw-r--r--  numpy/core/src/common/simd/avx2/math.h        |   8
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h   |  54
-rw-r--r--  numpy/core/src/common/simd/avx512/math.h      | 100
-rw-r--r--  numpy/core/src/common/simd/avx512/operators.h |  41
-rw-r--r--  numpy/core/src/common/simd/neon/math.h        |  81
-rw-r--r--  numpy/core/src/common/simd/neon/misc.h        |  20
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h   | 125
-rw-r--r--  numpy/core/src/common/simd/simd.h             |   4
-rw-r--r--  numpy/core/src/common/simd/sse/math.h         |   8
-rw-r--r--  numpy/core/src/common/simd/sse/operators.h    |  56
-rw-r--r--  numpy/core/src/common/simd/vec/math.h         |  45
-rw-r--r--  numpy/core/src/common/simd/vec/operators.h    |  26
12 files changed, 455 insertions(+), 113 deletions(-)
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index ab61944b5..5c869f911 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -172,7 +172,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
return _mm_cvtss_f32(_mm256_castps256_ps128(a)); \
} \
a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -181,7 +181,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); \
} \
a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
@@ -190,7 +190,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xff)) { \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
return pnan.f; \
} \
@@ -199,7 +199,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0xf)) { \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
return pnan.d; \
} \
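
The four hunks above apply one pattern: a bitmask test tied to AVX2's lane count (0xff for eight b32 lanes, 0xf for four b64 lanes) becomes a width-agnostic predicate. In isolation (a sketch, with has_nan_f32 as a hypothetical name):

    // old, width-dependent:  npyv_tobits_b32(notnan) != 0xff
    // new, portable:         !npyv_all_b32(notnan)
    NPY_FINLINE bool has_nan_f32(npyv_f32 a)
    { return !npyv_all_b32(npyv_notnan_f32(a)); }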
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 7682b24cb..c10267b21 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -225,4 +225,58 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); }
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return _mm256_movemask_epi8(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return _mm256_movemask_epi8(a) == -1; }
+NPYV_IMPL_AVX2_ANYALL(b8)
+NPYV_IMPL_AVX2_ANYALL(b16)
+NPYV_IMPL_AVX2_ANYALL(b32)
+NPYV_IMPL_AVX2_ANYALL(b64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_epi8( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) != -1; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_epi8( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX2_ANYALL(u8)
+NPYV_IMPL_AVX2_ANYALL(s8)
+NPYV_IMPL_AVX2_ANYALL(u16)
+NPYV_IMPL_AVX2_ANYALL(s16)
+NPYV_IMPL_AVX2_ANYALL(u32)
+NPYV_IMPL_AVX2_ANYALL(s32)
+NPYV_IMPL_AVX2_ANYALL(u64)
+NPYV_IMPL_AVX2_ANYALL(s64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX, XSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_##XSFX( \
+ _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_##XSFX( \
+ _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX2_ANYALL(f32, ps, 0xff)
+NPYV_IMPL_AVX2_ANYALL(f64, pd, 0xf)
+#undef NPYV_IMPL_AVX2_ANYALL
+
#endif // _NPY_SIMD_AVX2_OPERATORS_H
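
For the integer variants above, the sense is inverted because the comparison is against zero: cmpeq(a, 0) sets a lane exactly when that lane of a is zero, so "any lane nonzero" becomes movemask != -1 and "all lanes nonzero" becomes movemask == 0. A standalone sketch with raw AVX2 intrinsics (hypothetical helper names):

    #include <immintrin.h>
    #include <stdbool.h>

    static inline bool any_u32_avx2(__m256i a)
    {
        // zero lanes become all-ones; a nonzero lane clears some mask bits
        __m256i eqz = _mm256_cmpeq_epi32(a, _mm256_setzero_si256());
        return _mm256_movemask_epi8(eqz) != -1;
    }

    static inline bool all_u32_avx2(__m256i a)
    {
        // no lane may compare equal to zero
        __m256i eqz = _mm256_cmpeq_epi32(a, _mm256_setzero_si256());
        return _mm256_movemask_epi8(eqz) == 0;
    }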
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 8f57e6b16..97fd2d641 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -151,21 +151,28 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
#define npyv_reduce_max_f64 _mm512_reduce_max_pd
#else
// reduce min&max for 32&64-bits
- #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
- NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \
- { \
- __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
- __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
- __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
- __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
- return (STYPE##32)_mm_cvtsi128_si32(v32); \
- } \
- NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \
- { \
- __m512i v256 = _mm512_##VINTRIN##64(a, _mm512_shuffle_i64x2(a, a, _MM_SHUFFLE(0, 0, 3, 2))); \
- __m512i v128 = _mm512_##VINTRIN##64(v256, _mm512_shuffle_i64x2(v256, v256, _MM_SHUFFLE(0, 0, 0, 1))); \
- __m512i v64 = _mm512_##VINTRIN##64(v128, _mm512_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
- return (STYPE##64)npyv_extract0_u64(v64); \
+ #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \
+ { \
+ __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), \
+ npyv512_higher_si256(a)); \
+ __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), \
+ _mm256_extracti128_si256(v256, 1)); \
+ __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##32)_mm_cvtsi128_si32(v32); \
+ } \
+ NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \
+ { \
+ __m512i v256 = _mm512_##VINTRIN##64(a, \
+ _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m512i v128 = _mm512_##VINTRIN##64(v256, \
+ _mm512_shuffle_i64x2(v256, v256, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m512i v64 = _mm512_##VINTRIN##64(v128, \
+ _mm512_shuffle_epi32(v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ return (STYPE##64)npyv_extract0_u64(v64); \
}
NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu)
@@ -174,21 +181,28 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi)
#undef NPY_IMPL_AVX512_REDUCE_MINMAX
// reduce min&max for ps & pd
- #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \
- NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
- { \
- __m256 v256 = _mm256_##INTRIN##_ps(npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \
- __m128 v128 = _mm_##INTRIN##_ps(_mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \
- __m128 v64 = _mm_##INTRIN##_ps(v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(0, 0, 3, 2))); \
- __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \
- return _mm_cvtss_f32(v32); \
- } \
- NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
- { \
- __m256d v256 = _mm256_##INTRIN##_pd(npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \
- __m128d v128 = _mm_##INTRIN##_pd(_mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \
- __m128d v64 = _mm_##INTRIN##_pd(v128, _mm_shuffle_pd(v128, v128, _MM_SHUFFLE(0, 0, 0, 1))); \
- return _mm_cvtsd_f64(v64); \
+ #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ __m256 v256 = _mm256_##INTRIN##_ps( \
+ npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \
+ __m128 v128 = _mm_##INTRIN##_ps( \
+ _mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \
+ __m128 v64 = _mm_##INTRIN##_ps(v128, \
+ _mm_shuffle_ps(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128 v32 = _mm_##INTRIN##_ps(v64, \
+ _mm_shuffle_ps(v64, v64, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtss_f32(v32); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+ { \
+ __m256d v256 = _mm256_##INTRIN##_pd( \
+ npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \
+ __m128d v128 = _mm_##INTRIN##_pd( \
+ _mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \
+ __m128d v64 = _mm_##INTRIN##_pd(v128, \
+ _mm_shuffle_pd(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtsd_f64(v64); \
}
NPY_IMPL_AVX512_REDUCE_MINMAX(min)
@@ -199,7 +213,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
return _mm_cvtss_f32(_mm512_castps512_ps128(a)); \
} \
a = npyv_select_f32(notnan, a, \
@@ -209,7 +223,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
return _mm_cvtsd_f64(_mm512_castpd512_pd128(a)); \
} \
a = npyv_select_f64(notnan, a, \
@@ -219,7 +233,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xffff)) { \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
const union { npy_uint32 i; float f;} pnan = { \
0x7fc00000ul \
}; \
@@ -230,7 +244,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0xff)) { \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
const union { npy_uint64 i; double d;} pnan = { \
0x7ff8000000000000ull \
}; \
@@ -249,18 +263,24 @@ NPY_IMPL_AVX512_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000)
{ \
__m256i v256 = _mm256_##VINTRIN##16(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
__m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
- __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
- __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
- __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
return (STYPE##16)_mm_cvtsi128_si32(v16); \
} \
NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m512i a) \
{ \
__m256i v256 = _mm256_##VINTRIN##8(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
__m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
- __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
- __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
- __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
__m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \
return (STYPE##16)_mm_cvtsi128_si32(v8); \
}
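
Besides the re-indentation, the hunks above add (_MM_PERM_ENUM) casts on the shuffle immediates: some toolchains declare the immediate parameter of the 512-bit shuffles as _MM_PERM_ENUM rather than int, so the plain integer produced by _MM_SHUFFLE can trip stricter C++ enum conversions. A minimal illustration, assuming an AVX-512F target (high_to_low is a hypothetical name):

    #include <immintrin.h>

    static inline __m512i high_to_low(__m512i a)
    {
        // _MM_SHUFFLE(0, 0, 3, 2) == 0x0e: 128-bit chunks {2, 3, 0, 0},
        // i.e. the upper 256 bits land in the lower half
        return _mm512_shuffle_i64x2(a, a,
                                    (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2));
    }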
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 804cd24e8..c70932d5f 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_AVX512_OPERATORS_H
#define _NPY_SIMD_AVX512_OPERATORS_H
+#include "conversion.h" // tobits
+
/***************************
* Shifting
***************************/
@@ -336,4 +338,43 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); }
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX512_ANYALL(SFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return npyv_tobits_##SFX(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return npyv_tobits_##SFX(a) == MASK; }
+NPYV_IMPL_AVX512_ANYALL(b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
+#define NPYV_IMPL_AVX512_ANYALL(SFX, BSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return npyv_tobits_##BSFX( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return npyv_tobits_##BSFX( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX512_ANYALL(u8, b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(s8, b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(u16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(s16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(u32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(s32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(u64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(s64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(f32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(f64, b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
#endif // _NPY_SIMD_AVX512_OPERATORS_H
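
On AVX-512 the boolean types are mask registers, so npyv_tobits_* is essentially a register move and any/all collapse to integer compares. A raw-intrinsics sketch of the b32 case (hypothetical helpers; __mmask16 behaves as a plain 16-bit integer here):

    #include <immintrin.h>
    #include <stdbool.h>

    static inline bool any_b32_avx512(__mmask16 m) { return m != 0; }
    static inline bool all_b32_avx512(__mmask16 m) { return m == 0xffff; }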
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 0b4cf93f2..c0a771b5d 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -169,8 +169,6 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
#define npyv_reduce_max_s16 vmaxvq_s16
#define npyv_reduce_max_u32 vmaxvq_u32
#define npyv_reduce_max_s32 vmaxvq_s32
- #define npyv_reduce_max_u64 vmaxvq_u64
- #define npyv_reduce_max_s64 vmaxvq_s64
#define npyv_reduce_max_f32 vmaxvq_f32
#define npyv_reduce_max_f64 vmaxvq_f64
@@ -185,8 +183,6 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
#define npyv_reduce_min_s16 vminvq_s16
#define npyv_reduce_min_u32 vminvq_u32
#define npyv_reduce_min_s32 vminvq_s32
- #define npyv_reduce_min_u64 vminvq_u64
- #define npyv_reduce_min_s64 vminvq_s64
#define npyv_reduce_min_f32 vminvq_f32
#define npyv_reduce_min_f64 vminvq_f64
@@ -195,14 +191,14 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
#define npyv_reduce_minp_f32 vminnmvq_f32
#define npyv_reduce_minp_f64 vminnmvq_f64
#else
- #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
- NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
- { \
- STYPE##x8_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
- r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
- r = vp##INTRIN##_##SFX(r, r); \
- r = vp##INTRIN##_##SFX(r, r); \
- return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x8_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
}
NPY_IMPL_NEON_REDUCE_MINMAX(min, uint8, u8)
NPY_IMPL_NEON_REDUCE_MINMAX(max, uint8, u8)
@@ -210,13 +206,13 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
NPY_IMPL_NEON_REDUCE_MINMAX(max, int8, s8)
#undef NPY_IMPL_NEON_REDUCE_MINMAX
- #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
- NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
- { \
- STYPE##x4_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
- r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
- r = vp##INTRIN##_##SFX(r, r); \
- return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x4_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
}
NPY_IMPL_NEON_REDUCE_MINMAX(min, uint16, u16)
NPY_IMPL_NEON_REDUCE_MINMAX(max, uint16, u16)
@@ -224,12 +220,12 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
NPY_IMPL_NEON_REDUCE_MINMAX(max, int16, s16)
#undef NPY_IMPL_NEON_REDUCE_MINMAX
- #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
- NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
- { \
- STYPE##x2_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
- r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
- return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x2_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
}
NPY_IMPL_NEON_REDUCE_MINMAX(min, uint32, u32)
NPY_IMPL_NEON_REDUCE_MINMAX(max, uint32, u32)
@@ -237,31 +233,18 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
NPY_IMPL_NEON_REDUCE_MINMAX(max, int32, s32)
#undef NPY_IMPL_NEON_REDUCE_MINMAX
- #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, OP, STYPE, SFX) \
- NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
- { \
- npy_##STYPE a0 = (npy_##STYPE)vget_low_##SFX(a, 0); \
- npy_##STYPE a1 = (npy_##STYPE)vget_low_##SFX(a, 1); \
- return a0 OP a1 ? a0 : a1; \
- }
- NPY_IMPL_NEON_REDUCE_MINMAX(min, <, uint64, u64)
- NPY_IMPL_NEON_REDUCE_MINMAX(max, >, uint64, u64)
- NPY_IMPL_NEON_REDUCE_MINMAX(min, <, int64, s64)
- NPY_IMPL_NEON_REDUCE_MINMAX(max, >, int64, s64)
- #undef NPY_IMPL_NEON_REDUCE_MINMAX
-
#define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, INF) \
NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
{ \
- float32x2_t r = v##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a)); \
- r = v##INTRIN##_f32(r, vrev64_f32(r)); \
+ float32x2_t r = vp##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a));\
+ r = vp##INTRIN##_f32(r, r); \
return vget_lane_f32(r, 0); \
} \
NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (vget_lane_u32(notnan, 0) != 0) { \
- return vget_lane_f32(a, 0); \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+ return vgetq_lane_f32(a, 0); \
} \
a = npyv_select_f32(notnan, a, \
npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -274,7 +257,19 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
NPY_IMPL_NEON_REDUCE_MINMAX(min, 0x7f800000)
NPY_IMPL_NEON_REDUCE_MINMAX(max, 0xff800000)
#undef NPY_IMPL_NEON_REDUCE_MINMAX
-#endif
+#endif // NPY_SIMD_F64
+#define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP) \
+ NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE al = (STYPE)vget_low_##SFX(a); \
+ STYPE ah = (STYPE)vget_high_##SFX(a); \
+ return al OP ah ? al : ah; \
+ }
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_int64, s64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_uint64, u64, <)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_int64, s64, <)
+#undef NPY_IMPL_NEON_REDUCE_MINMAX
// round to nearest integer even
NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
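
The reduce rewrites above switch every step to the pairwise forms (vpmin/vpmax), which fold adjacent pairs: starting from the low/high halves, log2(nlanes) applications shrink the vector to one lane. A sketch for u8 with raw NEON, valid on both AArch32 and AArch64 (reduce_min_u8 is a hypothetical name):

    #include <arm_neon.h>

    static inline uint8_t reduce_min_u8(uint8x16_t a)
    {
        uint8x8_t r = vpmin_u8(vget_low_u8(a), vget_high_u8(a)); // 16 -> 8
        r = vpmin_u8(r, r);                                      //  8 -> 4
        r = vpmin_u8(r, r);                                      //  4 -> 2
        r = vpmin_u8(r, r);                                      //  2 -> 1
        return vget_lane_u8(r, 0);
    }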
diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h
index f1f0a5cc5..5fe109c13 100644
--- a/numpy/core/src/common/simd/neon/misc.h
+++ b/numpy/core/src/common/simd/neon/misc.h
@@ -139,16 +139,16 @@ NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
#define npyv_select_f64 vbslq_f64
// extract the first vector's lane
-#define npyv_extract0_u8(A) ((npy_uint8)vget_lane_u8((A, 0))
-#define npyv_extract0_s8(A) ((npy_int8)vget_lane_s8((A, 0))
-#define npyv_extract0_u16(A) ((npy_uint16)vget_lane_u16((A, 0))
-#define npyv_extract0_s16(A) ((npy_int16)vget_lane_s16((A, 0))
-#define npyv_extract0_u32(A) ((npy_uint32)vget_lane_u32((A, 0))
-#define npyv_extract0_s32(A) ((npy_int32)vget_lane_s32((A, 0))
-#define npyv_extract0_u64(A) ((npy_uint64)vget_lane_u64((A, 0))
-#define npyv_extract0_s64(A) ((npy_int64)vget_lane_s64((A, 0))
-#define npyv_extract0_f32(A) vget_lane_f32(A, 0)
-#define npyv_extract0_f64(A) vget_lane_f64(A, 0)
+#define npyv_extract0_u8(A) ((npy_uint8)vgetq_lane_u8(A, 0))
+#define npyv_extract0_s8(A) ((npy_int8)vgetq_lane_s8(A, 0))
+#define npyv_extract0_u16(A) ((npy_uint16)vgetq_lane_u16(A, 0))
+#define npyv_extract0_s16(A) ((npy_int16)vgetq_lane_s16(A, 0))
+#define npyv_extract0_u32(A) ((npy_uint32)vgetq_lane_u32(A, 0))
+#define npyv_extract0_s32(A) ((npy_int32)vgetq_lane_s32(A, 0))
+#define npyv_extract0_u64(A) ((npy_uint64)vgetq_lane_u64(A, 0))
+#define npyv_extract0_s64(A) ((npy_int64)vgetq_lane_s64(A, 0))
+#define npyv_extract0_f32(A) vgetq_lane_f32(A, 0)
+#define npyv_extract0_f64(A) vgetq_lane_f64(A, 0)
// Reinterpret
#define npyv_reinterpret_u8_u8(X) X
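
The old macros above were doubly broken: the extra parentheses turned the two arguments into a single comma expression, and vget_lane_* accepts only the 64-bit D-register types, while npyv vectors are 128-bit Q registers. The fix switches to the q-register forms (first_lane_u32 below is a hypothetical illustration):

    #include <arm_neon.h>

    static inline uint32_t first_lane_u32(uint32x4_t v)
    {
        // vget_lane_u32 would require a uint32x2_t; vgetq_lane_u32
        // matches the 128-bit type
        return vgetq_lane_u32(v, 0);
    }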
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index a08fa5390..249621bd6 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -246,4 +246,129 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{ return vceqq_f64(a, a); }
#endif
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#if NPY_SIMD_F64
+ #define NPYV_IMPL_NEON_ANYALL(LEN) \
+ NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+ { return vmaxvq_u##LEN(a) != 0; } \
+ NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+ { return vminvq_u##LEN(a) != 0; }
+ NPYV_IMPL_NEON_ANYALL(8)
+ NPYV_IMPL_NEON_ANYALL(16)
+ NPYV_IMPL_NEON_ANYALL(32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, USFX, BSFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return npyv_any_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return npyv_all_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); }
+ NPYV_IMPL_NEON_ANYALL(u8, u8, b8)
+ NPYV_IMPL_NEON_ANYALL(s8, u8, b8)
+ NPYV_IMPL_NEON_ANYALL(u16, u16, b16)
+ NPYV_IMPL_NEON_ANYALL(s16, u16, b16)
+ NPYV_IMPL_NEON_ANYALL(u32, u32, b32)
+ NPYV_IMPL_NEON_ANYALL(s32, u32, b32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ NPY_FINLINE bool npyv_any_b64(npyv_b64 a)
+ { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+ NPY_FINLINE bool npyv_all_b64(npyv_b64 a)
+ { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+ #define npyv_any_u64 npyv_any_b64
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ {
+ uint32x4_t a32 = vreinterpretq_u32_u64(a);
+ a32 = vorrq_u32(a32, vrev64q_u32(a32));
+ return vminvq_u32(a32) != 0;
+ }
+ NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+ { return npyv_any_u64(vreinterpretq_u64_s64(a)); }
+ NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+ { return npyv_all_u64(vreinterpretq_u64_s64(a)); }
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, BSFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return !npyv_all_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return !npyv_any_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); }
+ NPYV_IMPL_NEON_ANYALL(f32, b32)
+ NPYV_IMPL_NEON_ANYALL(f64, b64)
+ #undef NPYV_IMPL_NEON_ANYALL
+#else
+ #define NPYV_IMPL_NEON_ANYALL(LEN) \
+ NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) != 0; \
+ } \
+ NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) & \
+ vgetq_lane_s64(a64, 1) \
+ ) == -1; \
+ }
+ NPYV_IMPL_NEON_ANYALL(8)
+ NPYV_IMPL_NEON_ANYALL(16)
+ NPYV_IMPL_NEON_ANYALL(32)
+ NPYV_IMPL_NEON_ANYALL(64)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, USFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_##SFX(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) != 0; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##USFX tz = npyv_cmpeq_##SFX( \
+ a, npyv_zero_##SFX() \
+ ); \
+ int64x2_t a64 = vreinterpretq_s64_##USFX(tz); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) == 0; \
+ }
+ NPYV_IMPL_NEON_ANYALL(u8, u8)
+ NPYV_IMPL_NEON_ANYALL(s8, u8)
+ NPYV_IMPL_NEON_ANYALL(u16, u16)
+ NPYV_IMPL_NEON_ANYALL(s16, u16)
+ NPYV_IMPL_NEON_ANYALL(u32, u32)
+ NPYV_IMPL_NEON_ANYALL(s32, u32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ NPY_FINLINE bool npyv_any_f32(npyv_f32 a)
+ {
+ uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+ int64x2_t a64 = vreinterpretq_s64_u32(tz);
+ return (vgetq_lane_s64(a64, 0) & vgetq_lane_s64(a64, 1)) != -1ll;
+ }
+ NPY_FINLINE bool npyv_all_f32(npyv_f32 a)
+ {
+ uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+ int64x2_t a64 = vreinterpretq_s64_u32(tz);
+ return (vgetq_lane_s64(a64, 0) | vgetq_lane_s64(a64, 1)) == 0;
+ }
+ NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+ { return (vgetq_lane_s64(a, 0) | vgetq_lane_s64(a, 1)) != 0; }
+ NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+ { return vgetq_lane_s64(a, 0) && vgetq_lane_s64(a, 1); }
+ NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+ { return (vgetq_lane_u64(a, 0) | vgetq_lane_u64(a, 1)) != 0; }
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ { return vgetq_lane_u64(a, 0) && vgetq_lane_u64(a, 1); }
+#endif // NPY_SIMD_F64
+
#endif // _NPY_SIMD_NEON_OPERATORS_H
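
On AArch64 (the NPY_SIMD_F64 branch above) the horizontal-reduce instructions make each predicate one instruction plus a scalar compare: boolean lanes are 0 or all-ones, so the lane maximum is nonzero iff some lane is set and the lane minimum is nonzero iff every lane is set. A sketch (vmaxvq/vminvq exist only on AArch64; helper names are hypothetical):

    #include <arm_neon.h>
    #include <stdbool.h>

    static inline bool any_b32_neon(uint32x4_t m) { return vmaxvq_u32(m) != 0; }
    static inline bool all_b32_neon(uint32x4_t m) { return vminvq_u32(m) != 0; }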
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index b1492500f..92a77ad80 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -8,6 +8,10 @@
* TODO: Add an independent sphinx doc.
*/
#include "numpy/npy_common.h"
+#ifndef __cplusplus
+ #include <stdbool.h>
+#endif
+
#include "npy_cpu_dispatch.h"
#include "simd_utils.h"
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 83cfdd18b..b7f8e6ebb 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -202,7 +202,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
return _mm_cvtss_f32(a); \
} \
a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -211,7 +211,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
return _mm_cvtsd_f64(a); \
} \
a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
@@ -220,7 +220,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xf)) { \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
return pnan.f; \
} \
@@ -229,7 +229,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0x3)) { \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
return pnan.d; \
} \
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 86dbcfea5..59182679e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -283,4 +283,60 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm_castpd_si128(_mm_cmpord_pd(a, a)); }
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_SSE_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return _mm_movemask_epi8(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return _mm_movemask_epi8(a) == 0xffff; }
+NPYV_IMPL_SSE_ANYALL(b8)
+NPYV_IMPL_SSE_ANYALL(b16)
+NPYV_IMPL_SSE_ANYALL(b32)
+NPYV_IMPL_SSE_ANYALL(b64)
+#undef NPYV_IMPL_SSE_ANYALL
+
+#define NPYV_IMPL_SSE_ANYALL(SFX, MSFX, TSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm_movemask_##MSFX( \
+ _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm_movemask_##MSFX( \
+ _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_SSE_ANYALL(u8, epi8, epi8, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s8, epi8, epi8, 0xffff)
+NPYV_IMPL_SSE_ANYALL(u16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(u32, epi8, epi32, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s32, epi8, epi32, 0xffff)
+#ifdef NPY_HAVE_SSE41
+ NPYV_IMPL_SSE_ANYALL(u64, epi8, epi64, 0xffff)
+ NPYV_IMPL_SSE_ANYALL(s64, epi8, epi64, 0xffff)
+#else
+ NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+ {
+ return _mm_movemask_epi8(
+ _mm_cmpeq_epi32(a, npyv_zero_u64())
+ ) != 0xffff;
+ }
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ {
+ a = _mm_cmpeq_epi32(a, npyv_zero_u64());
+ a = _mm_and_si128(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)));
+ return _mm_movemask_epi8(a) == 0;
+ }
+ #define npyv_any_s64 npyv_any_u64
+ #define npyv_all_s64 npyv_all_u64
+#endif
+NPYV_IMPL_SSE_ANYALL(f32, ps, ps, 0xf)
+NPYV_IMPL_SSE_ANYALL(f64, pd, pd, 0x3)
+#undef NPYV_IMPL_SSE_ANYALL
+
#endif // _NPY_SIMD_SSE_OPERATORS_H
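
Without SSE4.1's 64-bit compare (pcmpeqq), the u64 fallback above assembles the test from 32-bit pieces: after cmpeq_epi32, each half of a 64-bit lane is ANDed with its partner (swapped via _MM_SHUFFLE(2, 3, 0, 1)), so the lane reads all-ones only when it was entirely zero. The same trick in isolation (all_u64_sse2 is a hypothetical name):

    #include <emmintrin.h>
    #include <stdbool.h>

    static inline bool all_u64_sse2(__m128i a)
    {
        __m128i eqz = _mm_cmpeq_epi32(a, _mm_setzero_si128());
        // AND each 32-bit half with the other half of the same 64-bit lane
        eqz = _mm_and_si128(eqz, _mm_shuffle_epi32(eqz, _MM_SHUFFLE(2, 3, 0, 1)));
        return _mm_movemask_epi8(eqz) == 0;  // no 64-bit lane was all zero
    }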
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 7ef529e21..95b16fdf7 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -69,7 +69,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
npyv_b32 nn_a = npyv_notnan_f32(a);
npyv_b32 nn_b = npyv_notnan_f32(b);
npyv_f32 max = vec_max(a, b);
- return vec_sel(a, vec_sel(b, max, nn_b), nn_a);
+ return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
}
#endif
NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
@@ -77,7 +77,7 @@ NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
npyv_b64 nn_a = npyv_notnan_f64(a);
npyv_b64 nn_b = npyv_notnan_f64(b);
npyv_f64 max = vec_max(a, b);
- return vec_sel(a, vec_sel(b, max, nn_b), nn_a);
+ return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
}
// Maximum, integer operations
@@ -118,15 +118,15 @@ NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
npyv_b32 nn_a = npyv_notnan_f32(a);
npyv_b32 nn_b = npyv_notnan_f32(b);
npyv_f32 min = vec_min(a, b);
- return vec_sel(a, vec_sel(b, min, nn_b), nn_a);
+ return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
}
#endif
NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
{
- npyv_b32 nn_a = npyv_notnan_f64(a);
- npyv_b32 nn_b = npyv_notnan_f64(b);
+ npyv_b64 nn_a = npyv_notnan_f64(a);
+ npyv_b64 nn_b = npyv_notnan_f64(b);
npyv_f64 min = vec_min(a, b);
- return vec_sel(a, vec_sel(b, min, nn_b), nn_a);
+ return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
}
// Minimum, integer operations
@@ -208,7 +208,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
{ \
npyv_b32 notnan = npyv_notnan_f32(a); \
- if (NPY_UNLIKELY(!vec_all(notnan))) { \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
const union { npy_uint32 i; float f;} \
pnan = {0x7fc00000UL}; \
return pnan.f; \
@@ -226,14 +226,10 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
} \
- NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
- { \
- return npyv_reduce_##INTRIN##_f64(a); \
- } \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
npyv_b64 notnan = npyv_notnan_f64(a); \
- if (NPY_UNLIKELY(!vec_all(notnan))) { \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
const union { npy_uint64 i; double f;} \
pnan = {0x7ff8000000000000ull}; \
return pnan.f; \
@@ -244,6 +240,31 @@ NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7ff0000000000000)
NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xfff0000000000000)
#undef NPY_IMPL_VEC_REDUCE_MINMAX
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+ #define npyv_reduce_minp_f64 npyv_reduce_min_f64
+ #define npyv_reduce_maxp_f64 npyv_reduce_max_f64
+#else
+ NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a)
+ {
+ npyv_b64 notnan = npyv_notnan_f64(a);
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+ return vec_extract(a, 0);
+ }
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+ npyv_setall_u64(0x7ff0000000000000)));
+ return npyv_reduce_min_f64(a);
+ }
+ NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a)
+ {
+ npyv_b64 notnan = npyv_notnan_f64(a);
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+ return vec_extract(a, 0);
+ }
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+ npyv_setall_u64(0xfff0000000000000)));
+ return npyv_reduce_max_f64(a);
+ }
+#endif
// round to nearest int even
#define npyv_rint_f64 vec_rint
// ceil
diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h
index 8b58676e7..50dac20f6 100644
--- a/numpy/core/src/common/simd/vec/operators.h
+++ b/numpy/core/src/common/simd/vec/operators.h
@@ -274,4 +274,30 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return vec_cmpeq(a, a); }
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_VEC_ANYALL(SFX, SFX2) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return vec_any_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return vec_all_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); }
+NPYV_IMPL_VEC_ANYALL(b8, u8)
+NPYV_IMPL_VEC_ANYALL(b16, u16)
+NPYV_IMPL_VEC_ANYALL(b32, u32)
+NPYV_IMPL_VEC_ANYALL(b64, u64)
+NPYV_IMPL_VEC_ANYALL(u8, u8)
+NPYV_IMPL_VEC_ANYALL(s8, s8)
+NPYV_IMPL_VEC_ANYALL(u16, u16)
+NPYV_IMPL_VEC_ANYALL(s16, s16)
+NPYV_IMPL_VEC_ANYALL(u32, u32)
+NPYV_IMPL_VEC_ANYALL(s32, s32)
+NPYV_IMPL_VEC_ANYALL(u64, u64)
+NPYV_IMPL_VEC_ANYALL(s64, s64)
+#if NPY_SIMD_F32
+ NPYV_IMPL_VEC_ANYALL(f32, f32)
+#endif
+NPYV_IMPL_VEC_ANYALL(f64, f64)
+#undef NPYV_IMPL_VEC_ANYALL
+
#endif // _NPY_SIMD_VEC_OPERATORS_H
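
On the Power/Z targets the compiler provides cross-lane predicates directly, so no mask extraction is needed: vec_any_ne and vec_all_ne already return a scalar truth value. A standalone sketch (plain int return sidesteps the bool macro that altivec.h may define in C; helper names are hypothetical):

    #include <altivec.h>

    static inline int any_u32_vec(__vector unsigned int a)
    { return vec_any_ne(a, vec_splats(0u)); }

    static inline int all_u32_vec(__vector unsigned int a)
    { return vec_all_ne(a, vec_splats(0u)); }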