| author    | Sayed Adel <seiko@imavr.com> | 2022-09-17 21:43:06 +0200 |
|-----------|------------------------------|---------------------------|
| committer | Sayed Adel <seiko@imavr.com> | 2022-09-19 08:27:21 +0200 |
| commit    | 6ef4c8bc1459f5d4f548ed87715651c6bc75fc49 (patch) | |
| tree      | 2d33c4f264ddb7dd3b7416b75baf7b8906be96ff /numpy/core/src/common | |
| parent    | a2697cac9adbeb2e9218543eed41c185182faf2a (diff) | |
SIMD: Add new intrinsics to check truth across all vector lanes
npyv_any_##SFX: returns true if any of the elements is not equal to zero
npyv_all_##SFX: returns true if all elements are not equal to zero
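
For reference, the per-lane semantics of the new intrinsics can be sketched in scalar C: `npyv_any_*` is an OR-reduction of "lane != 0" and `npyv_all_*` an AND-reduction. This is an illustration only; the standalone helper names below are hypothetical and not part of the commit.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of npyv_any_*: true if any lane is nonzero. */
    static bool any_lanes_u32(const uint32_t *lanes, size_t nlanes)
    {
        for (size_t i = 0; i < nlanes; ++i) {
            if (lanes[i] != 0) {
                return true;  /* one nonzero lane is enough */
            }
        }
        return false;
    }

    /* Scalar model of npyv_all_*: true only if every lane is nonzero. */
    static bool all_lanes_u32(const uint32_t *lanes, size_t nlanes)
    {
        for (size_t i = 0; i < nlanes; ++i) {
            if (lanes[i] == 0) {
                return false; /* a single zero lane breaks "all" */
            }
        }
        return true;
    }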
Diffstat (limited to 'numpy/core/src/common')
| -rw-r--r-- | numpy/core/src/common/simd/avx2/math.h        |   8 |
| -rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h   |  54 |
| -rw-r--r-- | numpy/core/src/common/simd/avx512/math.h      | 100 |
| -rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h |  41 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h        |  81 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/misc.h        |  20 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/operators.h   | 125 |
| -rw-r--r-- | numpy/core/src/common/simd/simd.h             |   4 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/math.h         |   8 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/operators.h    |  56 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/math.h         |  45 |
| -rw-r--r-- | numpy/core/src/common/simd/vec/operators.h    |  26 |
12 files changed, 455 insertions, 113 deletions
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index ab61944b5..5c869f911 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -172,7 +172,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
             return _mm_cvtss_f32(_mm256_castps256_ps128(a)); \
         } \
         a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -181,7 +181,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
             return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); \
         } \
         a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
@@ -190,7 +190,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xff)) { \
+        if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
             const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
             return pnan.f; \
         } \
@@ -199,7 +199,7 @@ NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0xf)) { \
+        if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
             const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
             return pnan.d; \
         } \
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 7682b24cb..c10267b21 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -225,4 +225,58 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
 { return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); }
 
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { return _mm256_movemask_epi8(a) != 0; } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { return _mm256_movemask_epi8(a) == -1; }
+NPYV_IMPL_AVX2_ANYALL(b8)
+NPYV_IMPL_AVX2_ANYALL(b16)
+NPYV_IMPL_AVX2_ANYALL(b32)
+NPYV_IMPL_AVX2_ANYALL(b64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { \
+        return _mm256_movemask_epi8( \
+            npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+        ) != -1; \
+    } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { \
+        return _mm256_movemask_epi8( \
+            npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+        ) == 0; \
+    }
+NPYV_IMPL_AVX2_ANYALL(u8)
+NPYV_IMPL_AVX2_ANYALL(s8)
+NPYV_IMPL_AVX2_ANYALL(u16)
+NPYV_IMPL_AVX2_ANYALL(s16)
+NPYV_IMPL_AVX2_ANYALL(u32)
+NPYV_IMPL_AVX2_ANYALL(s32)
+NPYV_IMPL_AVX2_ANYALL(u64)
+NPYV_IMPL_AVX2_ANYALL(s64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX, XSFX, MASK) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { \
+        return _mm256_movemask_##XSFX( \
+            _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+        ) != MASK; \
+    } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { \
+        return _mm256_movemask_##XSFX( \
+            _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+        ) == 0; \
+    }
+NPYV_IMPL_AVX2_ANYALL(f32, ps, 0xff)
+NPYV_IMPL_AVX2_ANYALL(f64, pd, 0xf)
+#undef NPYV_IMPL_AVX2_ANYALL
+
 #endif // _NPY_SIMD_AVX2_OPERATORS_H
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 8f57e6b16..97fd2d641 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -151,21 +151,28 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     #define npyv_reduce_max_f64 _mm512_reduce_max_pd
 #else
     // reduce min&max for 32&64-bits
-    #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
-    NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \
-    { \
-        __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
-        __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
-        __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
-        __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
-        return (STYPE##32)_mm_cvtsi128_si32(v32); \
-    } \
-    NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \
-    { \
-        __m512i v256 = _mm512_##VINTRIN##64(a, _mm512_shuffle_i64x2(a, a, _MM_SHUFFLE(0, 0, 3, 2))); \
-        __m512i v128 = _mm512_##VINTRIN##64(v256, _mm512_shuffle_i64x2(v256, v256, _MM_SHUFFLE(0, 0, 0, 1))); \
-        __m512i v64 = _mm512_##VINTRIN##64(v128, _mm512_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
-        return (STYPE##64)npyv_extract0_u64(v64); \
+    #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+    NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \
+    { \
+        __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), \
+                                            npyv512_higher_si256(a)); \
+        __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), \
+                                         _mm256_extracti128_si256(v256, 1)); \
+        __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, \
+                                        (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, \
+                                        (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        return (STYPE##32)_mm_cvtsi128_si32(v32); \
+    } \
+    NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \
+    { \
+        __m512i v256 = _mm512_##VINTRIN##64(a, \
+            _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        __m512i v128 = _mm512_##VINTRIN##64(v256, \
+            _mm512_shuffle_i64x2(v256, v256, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        __m512i v64 = _mm512_##VINTRIN##64(v128, \
+            _mm512_shuffle_epi32(v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        return (STYPE##64)npyv_extract0_u64(v64); \
     }
 
     NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu)
@@ -174,21 +181,28 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi)
     #undef NPY_IMPL_AVX512_REDUCE_MINMAX
     // reduce min&max for ps & pd
-    #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \
-    NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
-    { \
-        __m256 v256 = _mm256_##INTRIN##_ps(npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \
-        __m128 v128 = _mm_##INTRIN##_ps(_mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \
-        __m128 v64 = _mm_##INTRIN##_ps(v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(0, 0, 3, 2))); \
-        __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \
-        return _mm_cvtss_f32(v32); \
-    } \
-    NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
-    { \
-        __m256d v256 = _mm256_##INTRIN##_pd(npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \
-        __m128d v128 = _mm_##INTRIN##_pd(_mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \
-        __m128d v64 = _mm_##INTRIN##_pd(v128, _mm_shuffle_pd(v128, v128, _MM_SHUFFLE(0, 0, 0, 1))); \
-        return _mm_cvtsd_f64(v64); \
+    #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \
+    NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+    { \
+        __m256 v256 = _mm256_##INTRIN##_ps( \
+            npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \
+        __m128 v128 = _mm_##INTRIN##_ps( \
+            _mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \
+        __m128 v64 = _mm_##INTRIN##_ps(v128, \
+            _mm_shuffle_ps(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        __m128 v32 = _mm_##INTRIN##_ps(v64, \
+            _mm_shuffle_ps(v64, v64, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        return _mm_cvtss_f32(v32); \
+    } \
+    NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+    { \
+        __m256d v256 = _mm256_##INTRIN##_pd( \
+            npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \
+        __m128d v128 = _mm_##INTRIN##_pd( \
+            _mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \
+        __m128d v64 = _mm_##INTRIN##_pd(v128, \
+            _mm_shuffle_pd(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        return _mm_cvtsd_f64(v64); \
     }
 
     NPY_IMPL_AVX512_REDUCE_MINMAX(min)
@@ -199,7 +213,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
            return _mm_cvtss_f32(_mm512_castps512_ps128(a)); \
         } \
         a = npyv_select_f32(notnan, a, \
@@ -209,7 +223,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
             return _mm_cvtsd_f64(_mm512_castpd512_pd128(a)); \
         } \
         a = npyv_select_f64(notnan, a, \
@@ -219,7 +233,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xffff)) { \
+        if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
             const union { npy_uint32 i; float f;} pnan = { \
                 0x7fc00000ul \
             }; \
@@ -230,7 +244,7 @@ NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0xff)) { \
+        if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
             const union { npy_uint64 i; double d;} pnan = { \
                 0x7ff8000000000000ull \
             }; \
@@ -249,18 +263,24 @@ NPY_IMPL_AVX512_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000)
     { \
         __m256i v256 = _mm256_##VINTRIN##16(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
         __m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
-        __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
-        __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
-        __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+        __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, \
+                                        (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, \
+                                        (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, \
+                                        (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
         return (STYPE##16)_mm_cvtsi128_si32(v16); \
     } \
     NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m512i a) \
     { \
         __m256i v256 = _mm256_##VINTRIN##8(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
         __m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
-        __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
-        __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
-        __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+        __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, \
+                                       (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+        __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, \
+                                       (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+        __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, \
+                                       (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
         __m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \
         return (STYPE##16)_mm_cvtsi128_si32(v8); \
     }
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 804cd24e8..c70932d5f 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -5,6 +5,8 @@
 #ifndef _NPY_SIMD_AVX512_OPERATORS_H
 #define _NPY_SIMD_AVX512_OPERATORS_H
 
+#include "conversion.h" // tobits
+
 /***************************
  * Shifting
  ***************************/
@@ -336,4 +338,43 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
 { return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); }
 
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX512_ANYALL(SFX, MASK) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { return npyv_tobits_##SFX(a) != 0; } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { return npyv_tobits_##SFX(a) == MASK; }
+NPYV_IMPL_AVX512_ANYALL(b8,  0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
+#define NPYV_IMPL_AVX512_ANYALL(SFX, BSFX, MASK) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { \
+        return npyv_tobits_##BSFX( \
+            npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+        ) != MASK; \
+    } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { \
+        return npyv_tobits_##BSFX( \
+            npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+        ) == 0; \
+    }
+NPYV_IMPL_AVX512_ANYALL(u8,  b8,  0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(s8,  b8,  0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(u16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(s16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(u32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(s32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(u64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(s64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(f32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(f64, b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
 #endif // _NPY_SIMD_AVX512_OPERATORS_H
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 0b4cf93f2..c0a771b5d 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -169,8 +169,6 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     #define npyv_reduce_max_s16 vmaxvq_s16
     #define npyv_reduce_max_u32 vmaxvq_u32
     #define npyv_reduce_max_s32 vmaxvq_s32
-    #define npyv_reduce_max_u64 vmaxvq_u64
-    #define npyv_reduce_max_s64 vmaxvq_s64
     #define npyv_reduce_max_f32 vmaxvq_f32
     #define npyv_reduce_max_f64 vmaxvq_f64
 
@@ -185,8 +183,6 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     #define npyv_reduce_min_s16 vminvq_s16
     #define npyv_reduce_min_u32 vminvq_u32
     #define npyv_reduce_min_s32 vminvq_s32
-    #define npyv_reduce_min_u64 vminvq_u64
-    #define npyv_reduce_min_s64 vminvq_s64
     #define npyv_reduce_min_f32 vminvq_f32
     #define npyv_reduce_min_f64 vminvq_f64
 
@@ -195,14 +191,14 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     #define npyv_reduce_minp_f32 vminnmvq_f32
     #define npyv_reduce_minp_f64 vminnmvq_f64
 #else
-    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
-    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
-    { \
-        STYPE##x8_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
-        r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
-        r = vp##INTRIN##_##SFX(r, r); \
-        r = vp##INTRIN##_##SFX(r, r); \
-        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+    { \
+        STYPE##x8_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
     }
     NPY_IMPL_NEON_REDUCE_MINMAX(min, uint8, u8)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, uint8, u8)
@@ -210,13 +206,13 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, int8, s8)
     #undef NPY_IMPL_NEON_REDUCE_MINMAX
 
-    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
-    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
-    { \
-        STYPE##x4_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
-        r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
-        r = vp##INTRIN##_##SFX(r, r); \
-        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+    { \
+        STYPE##x4_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
     }
     NPY_IMPL_NEON_REDUCE_MINMAX(min, uint16, u16)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, uint16, u16)
@@ -224,12 +220,12 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, int16, s16)
     #undef NPY_IMPL_NEON_REDUCE_MINMAX
 
-    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
-    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
-    { \
-        STYPE##x2_t r = v##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
-        r = v##INTRIN##_##SFX(r, vrev64_##SFX(r)); \
-        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+    { \
+        STYPE##x2_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+        r = vp##INTRIN##_##SFX(r, r); \
+        return (npy_##STYPE)vget_lane_##SFX(r, 0); \
     }
     NPY_IMPL_NEON_REDUCE_MINMAX(min, uint32, u32)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, uint32, u32)
@@ -237,31 +233,18 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, int32, s32)
     #undef NPY_IMPL_NEON_REDUCE_MINMAX
 
-    #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, OP, STYPE, SFX) \
-    NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
-    { \
-        npy_##STYPE a0 = (npy_##STYPE)vget_low_##SFX(a, 0); \
-        npy_##STYPE a1 = (npy_##STYPE)vget_low_##SFX(a, 1); \
-        return a0 OP a1 ? a0 : a1; \
-    }
-    NPY_IMPL_NEON_REDUCE_MINMAX(min, <, uint64, u64)
-    NPY_IMPL_NEON_REDUCE_MINMAX(max, >, uint64, u64)
-    NPY_IMPL_NEON_REDUCE_MINMAX(min, <, int64, s64)
-    NPY_IMPL_NEON_REDUCE_MINMAX(max, >, int64, s64)
-    #undef NPY_IMPL_NEON_REDUCE_MINMAX
-
     #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, INF) \
     NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
     { \
-        float32x2_t r = v##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a)); \
-        r = v##INTRIN##_f32(r, vrev64_f32(r)); \
+        float32x2_t r = vp##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a));\
+        r = vp##INTRIN##_f32(r, r); \
         return vget_lane_f32(r, 0); \
     } \
     NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (vget_lane_u32(notnan, 0) != 0) { \
-            return vget_lane_f32(a, 0); \
+        if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+            return vgetq_lane_f32(a, 0); \
         } \
         a = npyv_select_f32(notnan, a, \
             npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -274,7 +257,19 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     NPY_IMPL_NEON_REDUCE_MINMAX(min, 0x7f800000)
     NPY_IMPL_NEON_REDUCE_MINMAX(max, 0xff800000)
     #undef NPY_IMPL_NEON_REDUCE_MINMAX
-#endif
+#endif // NPY_SIMD_F64
+#define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP) \
+    NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+    { \
+        STYPE al = (STYPE)vget_low_##SFX(a); \
+        STYPE ah = (STYPE)vget_high_##SFX(a); \
+        return al OP ah ? al : ah; \
+    }
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_int64,  s64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_uint64, u64, <)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_int64,  s64, <)
+#undef NPY_IMPL_NEON_REDUCE_MINMAX
 
 // round to nearest integer even
 NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h
index f1f0a5cc5..5fe109c13 100644
--- a/numpy/core/src/common/simd/neon/misc.h
+++ b/numpy/core/src/common/simd/neon/misc.h
@@ -139,16 +139,16 @@ NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
 #define npyv_select_f64 vbslq_f64
 
 // extract the first vector's lane
-#define npyv_extract0_u8(A) ((npy_uint8)vget_lane_u8((A, 0))
-#define npyv_extract0_s8(A) ((npy_int8)vget_lane_s8((A, 0))
-#define npyv_extract0_u16(A) ((npy_uint16)vget_lane_u16((A, 0))
-#define npyv_extract0_s16(A) ((npy_int16)vget_lane_s16((A, 0))
-#define npyv_extract0_u32(A) ((npy_uint32)vget_lane_u32((A, 0))
-#define npyv_extract0_s32(A) ((npy_int32)vget_lane_s32((A, 0))
-#define npyv_extract0_u64(A) ((npy_uint64)vget_lane_u64((A, 0))
-#define npyv_extract0_s64(A) ((npy_int64)vget_lane_s64((A, 0))
-#define npyv_extract0_f32(A) vget_lane_f32(A, 0)
-#define npyv_extract0_f64(A) vget_lane_f64(A, 0)
+#define npyv_extract0_u8(A) ((npy_uint8)vgetq_lane_u8(A, 0))
+#define npyv_extract0_s8(A) ((npy_int8)vgetq_lane_s8(A, 0))
+#define npyv_extract0_u16(A) ((npy_uint16)vgetq_lane_u16(A, 0))
+#define npyv_extract0_s16(A) ((npy_int16)vgetq_lane_s16(A, 0))
+#define npyv_extract0_u32(A) ((npy_uint32)vgetq_lane_u32(A, 0))
+#define npyv_extract0_s32(A) ((npy_int32)vgetq_lane_s32(A, 0))
+#define npyv_extract0_u64(A) ((npy_uint64)vgetq_lane_u64(A, 0))
+#define npyv_extract0_s64(A) ((npy_int64)vgetq_lane_s64(A, 0))
+#define npyv_extract0_f32(A) vgetq_lane_f32(A, 0)
+#define npyv_extract0_f64(A) vgetq_lane_f64(A, 0)
 
 // Reinterpret
 #define npyv_reinterpret_u8_u8(X) X
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index a08fa5390..249621bd6 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -246,4 +246,129 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
 { return vceqq_f64(a, a); }
 #endif
 
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#if NPY_SIMD_F64
+    #define NPYV_IMPL_NEON_ANYALL(LEN) \
+        NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+        { return vmaxvq_u##LEN(a) != 0; } \
+        NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+        { return vminvq_u##LEN(a) != 0; }
+    NPYV_IMPL_NEON_ANYALL(8)
+    NPYV_IMPL_NEON_ANYALL(16)
+    NPYV_IMPL_NEON_ANYALL(32)
+    #undef NPYV_IMPL_NEON_ANYALL
+
+    #define NPYV_IMPL_NEON_ANYALL(SFX, USFX, BSFX) \
+        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+        { return npyv_any_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } \
+        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+        { return npyv_all_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); }
+    NPYV_IMPL_NEON_ANYALL(u8,  u8,  b8)
+    NPYV_IMPL_NEON_ANYALL(s8,  u8,  b8)
+    NPYV_IMPL_NEON_ANYALL(u16, u16, b16)
+    NPYV_IMPL_NEON_ANYALL(s16, u16, b16)
+    NPYV_IMPL_NEON_ANYALL(u32, u32, b32)
+    NPYV_IMPL_NEON_ANYALL(s32, u32, b32)
+    #undef NPYV_IMPL_NEON_ANYALL
+
+    NPY_FINLINE bool npyv_any_b64(npyv_b64 a)
+    { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+    NPY_FINLINE bool npyv_all_b64(npyv_b64 a)
+    { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+    #define npyv_any_u64 npyv_any_b64
+    NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+    {
+        uint32x4_t a32 = vreinterpretq_u32_u64(a);
+        a32 = vorrq_u32(a32, vrev64q_u32(a32));
+        return vminvq_u32(a32) != 0;
+    }
+    NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+    { return npyv_any_u64(vreinterpretq_u64_s64(a)); }
+    NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+    { return npyv_all_u64(vreinterpretq_u64_s64(a)); }
+
+    #define NPYV_IMPL_NEON_ANYALL(SFX, BSFX) \
+        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+        { return !npyv_all_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } \
+        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+        { return !npyv_any_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); }
+    NPYV_IMPL_NEON_ANYALL(f32, b32)
+    NPYV_IMPL_NEON_ANYALL(f64, b64)
+    #undef NPYV_IMPL_NEON_ANYALL
+#else
+    #define NPYV_IMPL_NEON_ANYALL(LEN) \
+        NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+        { \
+            int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+            return ( \
+                vgetq_lane_s64(a64, 0) | \
+                vgetq_lane_s64(a64, 1) \
+            ) != 0; \
+        } \
+        NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+        { \
+            int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+            return ( \
+                vgetq_lane_s64(a64, 0) & \
+                vgetq_lane_s64(a64, 1) \
+            ) == -1; \
+        }
+    NPYV_IMPL_NEON_ANYALL(8)
+    NPYV_IMPL_NEON_ANYALL(16)
+    NPYV_IMPL_NEON_ANYALL(32)
+    NPYV_IMPL_NEON_ANYALL(64)
+    #undef NPYV_IMPL_NEON_ANYALL
+
+    #define NPYV_IMPL_NEON_ANYALL(SFX, USFX) \
+        NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+        { \
+            int64x2_t a64 = vreinterpretq_s64_##SFX(a); \
+            return ( \
+                vgetq_lane_s64(a64, 0) | \
+                vgetq_lane_s64(a64, 1) \
+            ) != 0; \
+        } \
+        NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+        { \
+            npyv_##USFX tz = npyv_cmpeq_##SFX( \
+                a, npyv_zero_##SFX() \
+            ); \
+            int64x2_t a64 = vreinterpretq_s64_##USFX(tz); \
+            return ( \
+                vgetq_lane_s64(a64, 0) | \
+                vgetq_lane_s64(a64, 1) \
+            ) == 0; \
+        }
+    NPYV_IMPL_NEON_ANYALL(u8,  u8)
+    NPYV_IMPL_NEON_ANYALL(s8,  u8)
+    NPYV_IMPL_NEON_ANYALL(u16, u16)
+    NPYV_IMPL_NEON_ANYALL(s16, u16)
+    NPYV_IMPL_NEON_ANYALL(u32, u32)
+    NPYV_IMPL_NEON_ANYALL(s32, u32)
+    #undef NPYV_IMPL_NEON_ANYALL
+
+    NPY_FINLINE bool npyv_any_f32(npyv_f32 a)
+    {
+        uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+        int64x2_t a64 = vreinterpretq_s64_u32(tz);
+        return (vgetq_lane_s64(a64, 0) & vgetq_lane_s64(a64, 1)) != -1ll;
+    }
+    NPY_FINLINE bool npyv_all_f32(npyv_f32 a)
+    {
+        uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+        int64x2_t a64 = vreinterpretq_s64_u32(tz);
+        return (vgetq_lane_s64(a64, 0) | vgetq_lane_s64(a64, 1)) == 0;
+    }
+    NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+    { return (vgetq_lane_s64(a, 0) | vgetq_lane_s64(a, 1)) != 0; }
+    NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+    { return vgetq_lane_s64(a, 0) && vgetq_lane_s64(a, 1); }
+    NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+    { return (vgetq_lane_u64(a, 0) | vgetq_lane_u64(a, 1)) != 0; }
+    NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+    { return vgetq_lane_u64(a, 0) && vgetq_lane_u64(a, 1); }
+#endif // NPY_SIMD_F64
+
 #endif // _NPY_SIMD_NEON_OPERATORS_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index b1492500f..92a77ad80 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -8,6 +8,10 @@
  * TODO: Add an independent sphinx doc.
  */
 #include "numpy/npy_common.h"
+#ifndef __cplusplus
+    #include <stdbool.h>
+#endif
+
 #include "npy_cpu_dispatch.h"
 #include "simd_utils.h"
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 83cfdd18b..b7f8e6ebb 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -202,7 +202,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
            return _mm_cvtss_f32(a); \
         } \
         a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
@@ -211,7 +211,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) == 0)) { \
+        if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
             return _mm_cvtsd_f64(a); \
         } \
         a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
@@ -220,7 +220,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b32(notnan) != 0xf)) { \
+        if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
             const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
             return pnan.f; \
         } \
@@ -229,7 +229,7 @@ NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(npyv_tobits_b64(notnan) != 0x3)) { \
+        if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
             const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
             return pnan.d; \
         } \
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 86dbcfea5..59182679e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -283,4 +283,60 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
 { return _mm_castpd_si128(_mm_cmpord_pd(a, a)); }
 
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_SSE_ANYALL(SFX) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { return _mm_movemask_epi8(a) != 0; } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { return _mm_movemask_epi8(a) == 0xffff; }
+NPYV_IMPL_SSE_ANYALL(b8)
+NPYV_IMPL_SSE_ANYALL(b16)
+NPYV_IMPL_SSE_ANYALL(b32)
+NPYV_IMPL_SSE_ANYALL(b64)
+#undef NPYV_IMPL_SSE_ANYALL
+
+#define NPYV_IMPL_SSE_ANYALL(SFX, MSFX, TSFX, MASK) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { \
+        return _mm_movemask_##MSFX( \
+            _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+        ) != MASK; \
+    } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { \
+        return _mm_movemask_##MSFX( \
+            _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+        ) == 0; \
+    }
+NPYV_IMPL_SSE_ANYALL(u8,  epi8, epi8,  0xffff)
+NPYV_IMPL_SSE_ANYALL(s8,  epi8, epi8,  0xffff)
+NPYV_IMPL_SSE_ANYALL(u16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(u32, epi8, epi32, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s32, epi8, epi32, 0xffff)
+#ifdef NPY_HAVE_SSE41
+    NPYV_IMPL_SSE_ANYALL(u64, epi8, epi64, 0xffff)
+    NPYV_IMPL_SSE_ANYALL(s64, epi8, epi64, 0xffff)
+#else
+    NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+    {
+        return _mm_movemask_epi8(
+            _mm_cmpeq_epi32(a, npyv_zero_u64())
+        ) != 0xffff;
+    }
+    NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+    {
+        a = _mm_cmpeq_epi32(a, npyv_zero_u64());
+        a = _mm_and_si128(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)));
+        return _mm_movemask_epi8(a) == 0;
+    }
+    #define npyv_any_s64 npyv_any_u64
+    #define npyv_all_s64 npyv_all_u64
+#endif
+NPYV_IMPL_SSE_ANYALL(f32, ps, ps, 0xf)
+NPYV_IMPL_SSE_ANYALL(f64, pd, pd, 0x3)
+#undef NPYV_IMPL_SSE_ANYALL
+
 #endif // _NPY_SIMD_SSE_OPERATORS_H
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 7ef529e21..95b16fdf7 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -69,7 +69,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
         npyv_b32 nn_a = npyv_notnan_f32(a);
         npyv_b32 nn_b = npyv_notnan_f32(b);
         npyv_f32 max = vec_max(a, b);
-        return vec_sel(a, vec_sel(b, max, nn_b), nn_a);
+        return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
     }
 #endif
 NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
@@ -77,7 +77,7 @@ NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
     npyv_b64 nn_a = npyv_notnan_f64(a);
     npyv_b64 nn_b = npyv_notnan_f64(b);
     npyv_f64 max = vec_max(a, b);
-    return vec_sel(a, vec_sel(b, max, nn_b), nn_a);
+    return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
 }
 
 // Maximum, integer operations
@@ -118,15 +118,15 @@ NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
         npyv_b32 nn_a = npyv_notnan_f32(a);
         npyv_b32 nn_b = npyv_notnan_f32(b);
         npyv_f32 min = vec_min(a, b);
-        return vec_sel(a, vec_sel(b, min, nn_b), nn_a);
+        return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
     }
 #endif
 NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
 {
-    npyv_b32 nn_a = npyv_notnan_f64(a);
-    npyv_b32 nn_b = npyv_notnan_f64(b);
+    npyv_b64 nn_a = npyv_notnan_f64(a);
+    npyv_b64 nn_b = npyv_notnan_f64(b);
     npyv_f64 min = vec_min(a, b);
-    return vec_sel(a, vec_sel(b, min, nn_b), nn_a);
+    return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
 }
 
 // Minimum, integer operations
@@ -208,7 +208,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
     NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
     { \
         npyv_b32 notnan = npyv_notnan_f32(a); \
-        if (NPY_UNLIKELY(!vec_all(notnan))) { \
+        if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
             const union { npy_uint32 i; float f;} \
             pnan = {0x7fc00000UL}; \
             return pnan.f; \
@@ -226,14 +226,10 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
         npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
         return vec_extract(r, 0); \
     } \
-    NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
-    { \
-        return npyv_reduce_##INTRIN##_f64(a); \
-    } \
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
     { \
         npyv_b64 notnan = npyv_notnan_f64(a); \
-        if (NPY_UNLIKELY(!vec_all(notnan))) { \
+        if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
             const union { npy_uint64 i; double f;} \
             pnan = {0x7ff8000000000000ull}; \
             return pnan.f; \
@@ -244,6 +240,31 @@ NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7ff0000000000000)
 NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xfff0000000000000)
 #undef NPY_IMPL_VEC_REDUCE_MINMAX
 
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    #define npyv_reduce_minp_f64 npyv_reduce_min_f64
+    #define npyv_reduce_maxp_f64 npyv_reduce_max_f64
+#else
+    NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a)
+    {
+        npyv_b64 notnan = npyv_notnan_f64(a);
+        if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+            return vec_extract(a, 0);
+        }
+        a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+                npyv_setall_u64(0x7ff0000000000000)));
+        return npyv_reduce_min_f64(a);
+    }
+    NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a)
+    {
+        npyv_b64 notnan = npyv_notnan_f64(a);
+        if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+            return vec_extract(a, 0);
+        }
+        a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+                npyv_setall_u64(0xfff0000000000000)));
+        return npyv_reduce_max_f64(a);
+    }
+#endif
 // round to nearest int even
 #define npyv_rint_f64 vec_rint
 // ceil
diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h
index 8b58676e7..50dac20f6 100644
--- a/numpy/core/src/common/simd/vec/operators.h
+++ b/numpy/core/src/common/simd/vec/operators.h
@@ -274,4 +274,30 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
 { return vec_cmpeq(a, a); }
 
+// Test cross all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_VEC_ANYALL(SFX, SFX2) \
+    NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+    { return vec_any_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); } \
+    NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+    { return vec_all_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); }
+NPYV_IMPL_VEC_ANYALL(b8,  u8)
+NPYV_IMPL_VEC_ANYALL(b16, u16)
+NPYV_IMPL_VEC_ANYALL(b32, u32)
+NPYV_IMPL_VEC_ANYALL(b64, u64)
+NPYV_IMPL_VEC_ANYALL(u8,  u8)
+NPYV_IMPL_VEC_ANYALL(s8,  s8)
+NPYV_IMPL_VEC_ANYALL(u16, u16)
+NPYV_IMPL_VEC_ANYALL(s16, s16)
+NPYV_IMPL_VEC_ANYALL(u32, u32)
+NPYV_IMPL_VEC_ANYALL(s32, s32)
+NPYV_IMPL_VEC_ANYALL(u64, u64)
+NPYV_IMPL_VEC_ANYALL(s64, s64)
+#if NPY_SIMD_F32
+    NPYV_IMPL_VEC_ANYALL(f32, f32)
+#endif
+NPYV_IMPL_VEC_ANYALL(f64, f64)
+#undef NPYV_IMPL_VEC_ANYALL
+
 #endif // _NPY_SIMD_VEC_OPERATORS_H
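
For context, a hedged sketch of how the reductions in this commit consult the new intrinsics. The wrapper name `reduce_minp_f32_sketch` is hypothetical and assumes the universal-intrinsics headers are available; the committed code expresses the same pattern inside the `npyv_reduce_*` macros above.

    #include "simd/simd.h"  /* NumPy universal intrinsics (assumed available) */

    static float reduce_minp_f32_sketch(npyv_f32 a)
    {
        npyv_b32 notnan = npyv_notnan_f32(a);
        if (NPY_UNLIKELY(!npyv_any_b32(notnan))) {
            /* every lane is NaN: return the first lane unchanged */
            return npyv_extract0_f32(a);
        }
        /* replace NaN lanes with +inf (0x7f800000) so they cannot win the min */
        a = npyv_select_f32(notnan, a,
                npyv_reinterpret_f32_u32(npyv_setall_u32(0x7f800000)));
        return npyv_reduce_min_f32(a);
    }

Replacing the per-backend `npyv_tobits_*` mask comparisons with `npyv_any_*`/`npyv_all_*` keeps the callers independent of each backend's mask width (0xf on SSE, 0xff on AVX2, 0xffff on AVX-512 for 32-bit lanes), which is what the hunks in `*/math.h` above do.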
