summary | refs | log | tree | commit | diff
path: root/numpy/core/src/common
diff options
context:
space:
mode:
author Rafael CF Sousa <rafaelcfsousa@ibm.com> 2022-05-27 10:17:29 -0300
committer Rafael CF Sousa <rafaelcfsousa@ibm.com> 2022-05-29 21:21:34 -0300
commit d5d6eb567ec228bcc65da184240349239db4f080 (patch)
tree 7976bdc6bc0c4b2232d5fa2dc8ddd926a07f12aa /numpy/core/src/common
parent 09b22a118466ff85fe365f451139cc3da2e8bc43 (diff)
download numpy-d5d6eb567ec228bcc65da184240349239db4f080.tar.gz
SIMD, ENH: Add universal intrinsic andc8 and use it to remove ifneq
This commit also applies some techniques to reduce the size of the binary generated from the source loops_comparison.dispatch.c.src
Diffstat (limited to 'numpy/core/src/common')
-rw-r--r-- numpy/core/src/common/simd/avx2/operators.h 7
-rw-r--r-- numpy/core/src/common/simd/avx512/conversion.h 4
-rw-r--r-- numpy/core/src/common/simd/avx512/operators.h 15
-rw-r--r-- numpy/core/src/common/simd/neon/operators.h 5
-rw-r--r-- numpy/core/src/common/simd/sse/operators.h 7
-rw-r--r-- numpy/core/src/common/simd/vsx/operators.h 5
6 files changed, 25 insertions, 18 deletions
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 0e77fc6be..99ef76dcb 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_not_b64 npyv_not_u8
// ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm256_cmpeq_epi8
/***************************
* Comparison
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index a2f56b2ae..474aee446 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
#ifdef NPY_HAVE_AVX512BW
- __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a);
- __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c);
+ __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
+ __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
return npyv_pack_b8_b16(ab, cd);
#else
const __m512i idx = _mm512_setr_epi32(
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 8c98b72dd..b856b345a 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -140,6 +140,9 @@
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif
+// ANDC
+#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A)
+
/***************************
* Logical (boolean)
***************************/
@@ -152,8 +155,8 @@
#define npyv_xor_b16 _kxor_mask32
#define npyv_not_b8 _knot_mask64
#define npyv_not_b16 _knot_mask32
- #define npyv_andc_b8 _kandn_mask64
- #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+ #define npyv_andc_b8(A, B) _kandn_mask64(B, A)
+ #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
#define npyv_xnor_b8 _kxnor_mask64
#elif defined(NPY_HAVE_AVX512BW)
NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
@@ -173,9 +176,9 @@
NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
{ return ~a; }
NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
- { return (~a) & b; }
+ { return a & (~b); }
NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
- { return (~a) | b; }
+ { return a | (~b); }
NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
{ return ~(a ^ b); }
#else
@@ -187,8 +190,8 @@
#define npyv_xor_b16 _mm512_xor_si512
#define npyv_not_b8 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
- #define npyv_andc_b8 _mm512_andnot_si512
- #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+ #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A)
+ #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
#endif
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 6c155fc67..a08fa5390 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -117,8 +117,9 @@
#define npyv_not_b64 npyv_not_u64
// ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vbicq_u8(B, A)
-#define npyv_orc_b8(A, B) vornq_u8(B, A)
+#define npyv_andc_u8 vbicq_u8
+#define npyv_andc_b8 vbicq_u8
+#define npyv_orc_b8 vornq_u8
#define npyv_xnor_b8 vceqq_u8
/***************************
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 51bdca356..86dbcfea5 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_not_b64 npyv_not_u8
// ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A)
+#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm_cmpeq_epi8
/***************************
* Comparison
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index fc29ba920..b01d85321 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
{ return vec_nor(a, a); }
// ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vec_andc(B, A)
-#define npyv_orc_b8(A, B) vec_orc(B, A)
+#define npyv_andc_u8 vec_andc
+#define npyv_andc_b8 vec_andc
+#define npyv_orc_b8 vec_orc
#define npyv_xnor_b8 vec_eqv
/***************************