diff options
| author | Rafael CF Sousa <rafaelcfsousa@ibm.com> | 2022-05-27 10:17:29 -0300 |
|---|---|---|
| committer | Rafael CF Sousa <rafaelcfsousa@ibm.com> | 2022-05-29 21:21:34 -0300 |
| commit | d5d6eb567ec228bcc65da184240349239db4f080 (patch) | |
| tree | 7976bdc6bc0c4b2232d5fa2dc8ddd926a07f12aa /numpy/core/src/common | |
| parent | 09b22a118466ff85fe365f451139cc3da2e8bc43 (diff) | |
| download | numpy-d5d6eb567ec228bcc65da184240349239db4f080.tar.gz | |
SIMD, ENH: Add universal intrinsic andc8 and use it to remove ifneq
This commit also applies some techniques to reduce the size of the binary generated from the source loops_comparison.dispatch.c.src.
Diffstat (limited to 'numpy/core/src/common')
| -rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h | 7 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/conversion.h | 4 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h | 15 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/operators.h | 5 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/operators.h | 7 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vsx/operators.h | 5 |
6 files changed, 25 insertions, 18 deletions
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 0e77fc6be..99ef76dcb 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A) +#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm256_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index a2f56b2ae..474aee446 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { #ifdef NPY_HAVE_AVX512BW - __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a); - __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c); + __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a); + __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c); return npyv_pack_b8_b16(ab, cd); #else const __m512i idx = _mm512_setr_epi32( diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 8c98b72dd..b856b345a 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -140,6 +140,9 @@ #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A))) #endif +// ANDC +#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A) + 
/*************************** * Logical (boolean) ***************************/ @@ -152,8 +155,8 @@ #define npyv_xor_b16 _kxor_mask32 #define npyv_not_b8 _knot_mask64 #define npyv_not_b16 _knot_mask32 - #define npyv_andc_b8 _kandn_mask64 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _kandn_mask64(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8 _kxnor_mask64 #elif defined(NPY_HAVE_AVX512BW) NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b) @@ -173,9 +176,9 @@ NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a) { return ~a; } NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) & b; } + { return a & (~b); } NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) | b; } + { return a | (~b); } NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) { return ~(a ^ b); } #else @@ -187,8 +190,8 @@ #define npyv_xor_b16 _mm512_xor_si512 #define npyv_not_b8 npyv_not_u8 #define npyv_not_b16 npyv_not_u8 - #define npyv_andc_b8 _mm512_andnot_si512 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) #endif diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index 6c155fc67..a08fa5390 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -117,8 +117,9 @@ #define npyv_not_b64 npyv_not_u64 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vbicq_u8(B, A) -#define npyv_orc_b8(A, B) vornq_u8(B, A) +#define npyv_andc_u8 vbicq_u8 +#define npyv_andc_b8 vbicq_u8 +#define npyv_orc_b8 vornq_u8 #define npyv_xnor_b8 vceqq_u8 /*************************** diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 51bdca356..86dbcfea5 100644 --- 
a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A) +#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index fc29ba920..b01d85321 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vec_andc(B, A) -#define npyv_orc_b8(A, B) vec_orc(B, A) +#define npyv_andc_u8 vec_andc +#define npyv_andc_b8 vec_andc +#define npyv_orc_b8 vec_orc #define npyv_xnor_b8 vec_eqv /*************************** |
