path: root/numpy
author     Matti Picus <matti.picus@gmail.com>   2022-05-30 18:28:52 +0300
committer  GitHub <noreply@github.com>           2022-05-30 18:28:52 +0300
commit     c8de16e440e778f4659671422ae78a6cdf7bf089 (patch)
tree       25dab479f2a4ccba19d8ff8c8d0f78d089e0c37c /numpy
parent     acfc1cc1d177e06002d87a10185dda1ef3192662 (diff)
parent     2701a5a38e1521e5dd66e343c81bf075c0c30d17 (diff)
download   numpy-c8de16e440e778f4659671422ae78a6cdf7bf089.tar.gz
Merge pull request #21483 from rafaelcfsousa/simd_comparison
SIMD: Use universal intrinsics to implement comparison functions
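For context: the six comparison ufuncs (equal, not_equal, less, less_equal, greater, greater_equal) are rewritten on top of NumPy's universal SIMD intrinsics. The new dispatch source provides three kernel flavours per type, matching the contiguous special cases a ufunc call can hit: both operands arrays, scalar on the left, and scalar on the right. A minimal usage sketch of those three shapes (illustration only, not part of the patch; the SIMD path is only taken when the memory layout allows it):

    import numpy as np

    a = np.arange(8, dtype=np.float32)
    b = np.full(8, 3, dtype=np.float32)

    # array/array         -> simd_binary_* kernels
    print(np.less(a, b))          # [ True  True  True False False False False False]
    # scalar on the left  -> simd_binary_scalar1_* kernels
    print(np.equal(np.float32(3), b))
    # scalar on the right -> simd_binary_scalar2_* kernels
    print(np.greater_equal(a, np.float32(3)))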
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py | 12
-rw-r--r--  numpy/core/setup.py | 1
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src | 25
-rw-r--r--  numpy/core/src/_simd/_simd_easyintrin.inc | 44
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h | 30
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h | 6
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h | 42
-rw-r--r--  numpy/core/src/common/simd/avx512/operators.h | 15
-rw-r--r--  numpy/core/src/common/simd/avx512/utils.h | 12
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h | 24
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h | 6
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h | 24
-rw-r--r--  numpy/core/src/common/simd/sse/operators.h | 6
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h | 23
-rw-r--r--  numpy/core/src/common/simd/vsx/operators.h | 6
-rw-r--r--  numpy/core/src/umath/loops.c.src | 53
-rw-r--r--  numpy/core/src/umath/loops.h.src | 83
-rw-r--r--  numpy/core/src/umath/loops_comparison.dispatch.c.src | 451
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 144
-rw-r--r--  numpy/core/tests/test_simd.py | 56
-rw-r--r--  numpy/core/tests/test_umath.py | 46
21 files changed, 909 insertions, 200 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 266fccefb..cc0e93d43 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -445,7 +445,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
@@ -453,7 +453,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
@@ -461,7 +461,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
@@ -469,7 +469,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
@@ -477,7 +477,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
@@ -485,7 +485,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', simd=[('avx2', ints)]),
+ TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
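The entries above stop using the old AVX2-only integer path (simd=[('avx2', ints)]) and instead route booleans, all integers and single/double floats through the new loops_comparison dispatch source. As a rough worked example of which type characters 'bints'+'fd' covers, assuming the conventional generate_umath strings ints = 'bBhHiIlLqQ' and bints = '?' + ints (an assumption for illustration, not quoted from the patch):

    # hypothetical reconstruction of the type-character strings used above
    ints = 'bBhHiIlLqQ'      # signed/unsigned integer type codes
    bints = '?' + ints       # boolean + integers
    print(bints + 'fd')      # '?bBhHiIlLqQfd' -> bool, all ints, float32, float64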
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 10c1c3f92..7d072c15c 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1068,6 +1068,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
+ join('src', 'umath', 'loops_comparison.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index ab48db5b1..0f3e4fc8f 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -31,6 +31,7 @@
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
#if @simd_sup@
/***************************
@@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
/**end repeat1**/
+#if @bitw8b_sup@
+SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+#endif
+
/***************************
* Conversion
***************************/
@@ -472,6 +480,10 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/
+SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
+SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64,
+ vb64, vb64, vb64, vb64)
//#########################################################################
//## Attach module functions
@@ -496,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
#if @simd_sup@
@@ -577,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
+#if @bitw8b_sup@
+SIMD_INTRIN_DEF(andc_@sfx@)
+SIMD_INTRIN_DEF(andc_@bsfx@)
+SIMD_INTRIN_DEF(orc_@bsfx@)
+SIMD_INTRIN_DEF(xnor_@bsfx@)
+#endif
+
/***************************
* Conversion
***************************/
@@ -716,6 +736,11 @@ SIMD_INTRIN_DEF(not_@bsfx@)
SIMD_INTRIN_DEF(tobits_@bsfx@)
/**end repeat**/
+// Pack multiple vectors into one
+SIMD_INTRIN_DEF(pack_b8_b16)
+SIMD_INTRIN_DEF(pack_b8_b32)
+SIMD_INTRIN_DEF(pack_b8_b64)
+
/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index 4521b2d87..f2e0da26e 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -153,6 +153,50 @@
return simd_arg_to_obj(&ret); \
}
+#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \
+ IN4, IN5, IN6, IN7) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ simd_arg arg4 = {.dtype = simd_data_##IN3}; \
+ simd_arg arg5 = {.dtype = simd_data_##IN4}; \
+ simd_arg arg6 = {.dtype = simd_data_##IN5}; \
+ simd_arg arg7 = {.dtype = simd_data_##IN6}; \
+ simd_arg arg8 = {.dtype = simd_data_##IN7}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3, \
+ simd_arg_converter, &arg4, \
+ simd_arg_converter, &arg5, \
+ simd_arg_converter, &arg6, \
+ simd_arg_converter, &arg7, \
+ simd_arg_converter, &arg8 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2, arg4.data.IN3, \
+ arg5.data.IN4, arg6.data.IN5, \
+ arg7.data.IN6, arg8.data.IN7 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg_free(&arg4); \
+ simd_arg_free(&arg5); \
+ simd_arg_free(&arg6); \
+ simd_arg_free(&arg7); \
+ simd_arg_free(&arg8); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
/**
 * Helper macros for repeating and expanding a certain macro.
* Mainly used for converting a scalar to an immediate constant.
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 64e051686..00ac0d38a 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -58,6 +58,36 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}
+// pack two 16-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+ __m256i ab = _mm256_packs_epi16(a, b);
+ return npyv256_shuffle_odd(ab);
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+ __m256i ab = _mm256_packs_epi32(a, b);
+ __m256i cd = _mm256_packs_epi32(c, d);
+ __m256i abcd = npyv_pack_b8_b16(ab, cd);
+ return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+ npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+ __m256i ab = _mm256_packs_epi32(a, b);
+ __m256i cd = _mm256_packs_epi32(c, d);
+ __m256i ef = _mm256_packs_epi32(e, f);
+ __m256i gh = _mm256_packs_epi32(g, h);
+ __m256i abcd = _mm256_packs_epi32(ab, cd);
+ __m256i efgh = _mm256_packs_epi32(ef, gh);
+ __m256i all = npyv256_shuffle_odd(_mm256_packs_epi16(abcd, efgh));
+ __m256i rev128 = _mm256_alignr_epi8(all, all, 8);
+ return _mm256_unpacklo_epi16(all, rev128);
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
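The npyv_pack_b8_b16/_b32/_b64 helpers added here (and mirrored for AVX-512, NEON, SSE and VSX below) narrow two, four or eight wide-lane boolean vectors into a single 8-bit-lane boolean vector, in lane order, so one store can write the comparison result for a whole block of elements; a register holds four times fewer 32-bit lanes than 8-bit lanes, so four 32-bit masks are needed to fill one byte vector. A scalar Python model of the intended semantics (not the intrinsic implementation):

    # Each argument models a boolean vector of wider lanes; the packed result
    # is their concatenation as 8-bit lanes (all-ones for true, zero for false).
    def pack_b8_b32(a, b, c, d):
        return [0xFF if lane else 0x00 for lane in (a + b + c + d)]

    # four vectors of 4 x 32-bit lanes -> one vector of 16 x 8-bit lanes
    print(pack_b8_b32([True, False, True, False],
                      [False, False, True, True],
                      [True] * 4,
                      [False] * 4))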
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 5fc7719e9..99ef76dcb 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -114,6 +114,12 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_not_b32 npyv_not_u8
#define npyv_not_b64 npyv_not_u8
+// ANDC, ORC and XNOR
+#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm256_cmpeq_epi8
+
/***************************
* Comparison
***************************/
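The new ANDC, ORC and XNOR operators compute a & ~b, a | ~b and ~(a ^ b). The boolean comparison kernels in the new dispatch source first compare each input byte against zero (a logical NOT) and then map ==, !=, < and <= to xnor, xor, andc and orc of those masks. A quick scalar check of the identities (illustration only):

    # Verify the boolean identities used by the b8 comparison kernels.
    for x in (False, True):
        for y in (False, True):
            a, b = (not x), (not y)             # inputs compared against zero
            assert (x == y) == (not (a ^ b))    # equal      -> xnor
            assert (x != y) == (a ^ b)          # not_equal  -> xor
            assert (x < y)  == (a and not b)    # less       -> andc
            assert (x <= y) == (a or not b)     # less_equal -> orc
    print("boolean identities hold")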
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0bd44179b..474aee446 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -90,6 +90,48 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
return r;
}
+// pack two 16-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+#ifdef NPY_HAVE_AVX512BW
+ return _mm512_kunpackd((__mmask64)b, (__mmask64)a);
+#else
+ const __m512i idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+ return _mm512_permutexvar_epi64(idx, npyv512_packs_epi16(a, b));
+#endif
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+#ifdef NPY_HAVE_AVX512BW
+ __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
+ __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
+ return npyv_pack_b8_b16(ab, cd);
+#else
+ const __m512i idx = _mm512_setr_epi32(
+ 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+ __m256i ta = npyv512_pack_lo_hi(npyv_cvt_u32_b32(a));
+ __m256i tb = npyv512_pack_lo_hi(npyv_cvt_u32_b32(b));
+ __m256i tc = npyv512_pack_lo_hi(npyv_cvt_u32_b32(c));
+ __m256i td = npyv512_pack_lo_hi(npyv_cvt_u32_b32(d));
+ __m256i ab = _mm256_packs_epi16(ta, tb);
+ __m256i cd = _mm256_packs_epi16(tc, td);
+ __m512i abcd = npyv512_combine_si256(ab, cd);
+ return _mm512_permutexvar_epi32(idx, abcd);
+#endif
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+ npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+ __mmask16 ab = _mm512_kunpackb((__mmask16)b, (__mmask16)a);
+ __mmask16 cd = _mm512_kunpackb((__mmask16)d, (__mmask16)c);
+ __mmask16 ef = _mm512_kunpackb((__mmask16)f, (__mmask16)e);
+ __mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g);
+ return npyv_pack_b8_b32(ab, cd, ef, gh);
+}
+
// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index d53932fa8..b856b345a 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -140,6 +140,9 @@
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif
+// ANDC
+#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A)
+
/***************************
* Logical (boolean)
***************************/
@@ -152,6 +155,9 @@
#define npyv_xor_b16 _kxor_mask32
#define npyv_not_b8 _knot_mask64
#define npyv_not_b16 _knot_mask32
+ #define npyv_andc_b8(A, B) _kandn_mask64(B, A)
+ #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+ #define npyv_xnor_b8 _kxnor_mask64
#elif defined(NPY_HAVE_AVX512BW)
NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
{ return a & b; }
@@ -169,6 +175,12 @@
{ return ~a; }
NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
{ return ~a; }
+ NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
+ { return a & (~b); }
+ NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
+ { return a | (~b); }
+ NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
+ { return ~(a ^ b); }
#else
#define npyv_and_b8 _mm512_and_si512
#define npyv_and_b16 _mm512_and_si512
@@ -178,6 +190,9 @@
#define npyv_xor_b16 _mm512_xor_si512
#define npyv_not_b8 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
+ #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A)
+ #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+ #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
#endif
#define npyv_and_b32 _mm512_kand
diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h
index c3079283f..ced3bfef0 100644
--- a/numpy/core/src/common/simd/avx512/utils.h
+++ b/numpy/core/src/common/simd/avx512/utils.h
@@ -87,4 +87,16 @@
)); \
}
+#ifndef NPY_HAVE_AVX512BW
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv512_packs_epi16, _mm256_packs_epi16)
+#else
+ #define npyv512_packs_epi16 _mm512_packs_epi16
+#endif
+
+NPY_FINLINE __m256i npyv512_pack_lo_hi(__m512i a) {
+ __m256i lo = npyv512_lower_si256(a);
+ __m256i hi = npyv512_higher_si256(a);
+ return _mm256_packs_epi32(lo, hi);
+}
+
#endif // _NPY_SIMD_AVX512_UTILS_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index 7487559d1..b6a50dc7a 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -86,6 +86,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}
+// pack two 16-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+ return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+ npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+ npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
+ return npyv_pack_b8_b16(ab, cd);
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+ npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+ npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b));
+ npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d));
+ npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f));
+ npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h));
+ return npyv_pack_b8_b32(ab, cd, ef, gh);
+}
+
// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index b43ba3653..a08fa5390 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -116,6 +116,12 @@
#define npyv_not_b32 vmvnq_u32
#define npyv_not_b64 npyv_not_u64
+// ANDC, ORC and XNOR
+#define npyv_andc_u8 vbicq_u8
+#define npyv_andc_b8 vbicq_u8
+#define npyv_orc_b8 vornq_u8
+#define npyv_xnor_b8 vceqq_u8
+
/***************************
* Comparison
***************************/
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ab7eb4907..0811bf06a 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -59,6 +59,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}
+// pack two 16-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+ return _mm_packs_epi16(a, b);
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+ npyv_b16 ab = _mm_packs_epi32(a, b);
+ npyv_b16 cd = _mm_packs_epi32(c, d);
+ return npyv_pack_b8_b16(ab, cd);
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+ npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+ npyv_b32 ab = _mm_packs_epi32(a, b);
+ npyv_b32 cd = _mm_packs_epi32(c, d);
+ npyv_b32 ef = _mm_packs_epi32(e, f);
+ npyv_b32 gh = _mm_packs_epi32(g, h);
+ return npyv_pack_b8_b32(ab, cd, ef, gh);
+}
+
// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 51c84fb4e..86dbcfea5 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -115,6 +115,12 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_not_b32 npyv_not_u8
#define npyv_not_b64 npyv_not_u8
+// ANDC, ORC and XNOR
+#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A)
+#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm_cmpeq_epi8
+
/***************************
* Comparison
***************************/
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 36bea7bba..a599f3950 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -48,6 +48,29 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
return r;
}
+// pack two 16-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+ return vec_pack(a, b);
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+ npyv_b16 ab = vec_pack(a, b);
+ npyv_b16 cd = vec_pack(c, d);
+ return npyv_pack_b8_b16(ab, cd);
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+ npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+ npyv_b32 ab = vec_pack(a, b);
+ npyv_b32 cd = vec_pack(c, d);
+ npyv_b32 ef = vec_pack(e, f);
+ npyv_b32 gh = vec_pack(g, h);
+ return npyv_pack_b8_b32(ab, cd, ef, gh);
+}
+
// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index d34057ff3..b01d85321 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -133,6 +133,12 @@ NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a)
NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
{ return vec_nor(a, a); }
+// ANDC, ORC and XNOR
+#define npyv_andc_u8 vec_andc
+#define npyv_andc_b8 vec_andc
+#define npyv_orc_b8 vec_orc
+#define npyv_xnor_b8 vec_eqv
+
/***************************
* Comparison
***************************/
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3a8a54913..9ae686399 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -401,23 +401,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
*/
/**begin repeat
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
- * #OP = ==, !=, >, >=, <, <=#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- BINARY_LOOP {
- npy_bool in1 = *((npy_bool *)ip1) != 0;
- npy_bool in2 = *((npy_bool *)ip2) != 0;
- *((npy_bool *)op1)= in1 @OP@ in2;
- }
-}
-/**end repeat**/
-
-
-/**begin repeat
* #kind = logical_and, logical_or#
* #OP = &&, ||#
* #SC = ==, !=#
@@ -688,9 +671,8 @@ void
/**begin repeat2
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- * logical_and, logical_or#
- * #OP = ==, !=, >, >=, <, <=, &&, ||#
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
*/
#if @CHK@
@@ -1408,19 +1390,16 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
* #C = F, , L#
*/
/**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
- * logical_and, logical_or#
- * #OP = ==, !=, <, <=, >, >=, &&, ||#
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- *((npy_bool *)op1) = in1 @OP@ in2;
- }
+ BINARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
@@ -1654,6 +1633,22 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps
}
/**end repeat**/
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ const npy_longdouble in2 = *(npy_longdouble *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat**/
+
NPY_NO_EXPORT void
LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
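After this change, float and double comparisons come from the new dispatch source, while long double (scalar loops added just above) and half keep plain scalar loops; user-visible behaviour is unchanged. A trivial sanity check (illustration only):

    import numpy as np

    x = np.array([1.0, 2.0, 3.0], dtype=np.longdouble)
    print(np.less_equal(x, np.longdouble(2)))   # [ True  True False]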
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 694518ae0..5af9f1788 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -28,9 +28,19 @@
*****************************************************************************
*/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_comparison.dispatch.h"
+#endif
+
+/**begin repeat
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
/**begin repeat
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- * logical_and, logical_or, absolute, logical_not#
+ * #kind = logical_and, logical_or, absolute, logical_not#
**/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -60,8 +70,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
BYTE, SHORT, INT, LONG, LONGLONG#
*/
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -72,14 +82,28 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
BYTE, SHORT, INT, LONG, LONGLONG#
*/
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**begin repeat1
+ * #kind = divmod, fmod, remainder#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_comparison.dispatch.h"
+#endif
- NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+/**begin repeat1
+ * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
/**end repeat**/
/**begin repeat
@@ -136,9 +160,8 @@ NPY_NO_EXPORT void
/**end repeat3**/
/**begin repeat3
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- * logical_and, logical_or#
- * #OP = ==, !=, >, >=, <, <=, &&, ||#
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
*/
NPY_NO_EXPORT void
@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -232,9 +255,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
/**end repeat1**/
/**end repeat**/
-/**end repeat1**/
-/**end repeat**/
-
// SVML
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_umath_fp.dispatch.h"
@@ -300,6 +320,21 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_comparison.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
+ char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+/**end repeat1**/
+/**end repeat**/
+
/**begin repeat
* Float types
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
@@ -307,7 +342,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
* #C = F, F, , L#
*/
-
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
@@ -318,9 +352,8 @@ NPY_NO_EXPORT void
/**end repeat1**/
/**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
- * logical_and, logical_or#
- * #OP = ==, !=, <, <=, >, >=, &&, ||#
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -407,6 +440,16 @@ NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
+/**begin repeat
+ * #TYPE = HALF, LONGDOUBLE#
+ */
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
+/**end repeat**/
/*
*****************************************************************************
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
new file mode 100644
index 000000000..01d58fbf9
--- /dev/null
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -0,0 +1,451 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse42 avx2 avx512f avx512_skx
+ ** vsx2 vsx3
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
+ * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
+ * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple#
+ */
+#if @VECTOR@ && !((@eq@ || @neq@) && @signed@)
+static void simd_binary_@kind@_@sfx@(char **args, npy_intp len)
+{
+ npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const int vstep = npyv_nlanes_u8;
+
+ // Unroll the loop so the comparison results fill one 'vstep'-element u8 vector.
+ for (; len >= vstep;
+ len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if @len@ >= 8
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0);
+ npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1);
+#if @len@ >= 16
+ npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1);
+ npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1);
+ npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2);
+#if @len@ >= 32
+ npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2);
+ npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2);
+ npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3);
+ npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3);
+ npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3);
+ npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4);
+#if @len@ == 64
+ npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4);
+ npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4);
+ npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5);
+ npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5);
+ npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6);
+ npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6);
+ npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7);
+ npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7);
+ npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5);
+ npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6);
+ npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7);
+ npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8);
+#endif // @len@ >= 64
+#endif // @len@ >= 32
+#endif // @len@ >= 16
+#endif // @len@ >= 8
+
+ // Pack the 'c' vectors into a single vector 'r'
+#if @len@ == 8
+ npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif @len@ == 16
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif @len@ == 32
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif @len@ == 64
+ npyv_u8 r =
+ npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+ npyv_store_u8(dst, npyv_and_u8(r, truemask));
+ }
+
+ for (; len > 0; --len, ++src1, ++src2, ++dst) {
+ const npyv_lanetype_@sfx@ a = *src1;
+ const npyv_lanetype_@sfx@ b = *src2;
+ *dst = a @OP@ b;
+ }
+}
+
+static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len)
+{
+ npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_@sfx@ a = npyv_setall_@sfx@(scalar);
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const int vstep = npyv_nlanes_u8;
+
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if @len@ >= 8
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0);
+ npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1);
+#if @len@ >= 16
+ npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1);
+ npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2);
+#if @len@ >= 32
+ npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2);
+ npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3);
+ npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3);
+ npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4);
+#if @len@ == 64
+ npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4);
+ npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5);
+ npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6);
+ npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7);
+ npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5);
+ npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6);
+ npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a, b7);
+ npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8);
+#endif // @len@ >= 64
+#endif // @len@ >= 32
+#endif // @len@ >= 16
+#endif // @len@ >= 8
+
+#if @len@ == 8
+ npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif @len@ == 16
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif @len@ == 32
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif @len@ == 64
+ npyv_u8 r =
+ npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+ npyv_store_u8(dst, npyv_and_u8(r, truemask));
+ }
+
+ for (; len > 0; --len, ++src, ++dst) {
+ const npyv_lanetype_@sfx@ b = *src;
+ *dst = scalar @OP@ b;
+ }
+}
+
+static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len)
+{
+ npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_@sfx@ b = npyv_setall_@sfx@(scalar);
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const int vstep = npyv_nlanes_u8;
+
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if @len@ >= 8
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0);
+ npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b);
+#if @len@ >= 16
+ npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1);
+ npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b);
+#if @len@ >= 32
+ npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2);
+ npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3);
+ npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b);
+ npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b);
+#if @len@ == 64
+ npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4);
+ npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5);
+ npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6);
+ npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7);
+ npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b);
+ npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b);
+ npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b);
+ npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b);
+#endif // @len@ >= 64
+#endif // @len@ >= 32
+#endif // @len@ >= 16
+#endif // @len@ >= 8
+
+#if @len@ == 8
+ npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif @len@ == 16
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif @len@ == 32
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif @len@ == 64
+ npyv_u8 r =
+ npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+ npyv_store_u8(dst, npyv_and_u8(r, truemask));
+ }
+
+ for (; len > 0; --len, ++src, ++dst) {
+ const npyv_lanetype_@sfx@ a = *src;
+ *dst = a @OP@ scalar;
+ }
+}
+#endif
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = xnor, xor, andc, orc#
+ */
+
+#if NPY_SIMD
+static void simd_binary_@kind@_b8(char **args, npy_intp len)
+{
+ npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+ npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const npyv_u8 vzero = npyv_setall_u8(0x0);
+ const int vstep = npyv_nlanes_u8;
+
+ for (; len >= vstep;
+ len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+ // Compare against zero: lanes become 0xFF where the element is zero (a logical NOT),
+ // so any nonzero input counts as true
+ npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+ npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
+ npyv_b8 c = npyv_@VOP@_b8(a, b);
+ npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ }
+
+ for (; len > 0; --len, ++src1, ++src2, ++dst) {
+ const npyv_lanetype_u8 a = *src1 != 0;
+ const npyv_lanetype_u8 b = *src2 != 0;
+ *dst = a @OP@ b;
+ }
+}
+
+static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
+{
+ npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+ npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_u8 vzero = npyv_setall_u8(0x0);
+ const npyv_u8 vscalar = npyv_setall_u8(scalar);
+ const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero);
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const int vstep = npyv_nlanes_u8;
+
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+ npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+ npyv_b8 c = npyv_@VOP@_b8(a, b);
+ npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ }
+
+ for (; len > 0; --len, ++src, ++dst) {
+ const npyv_lanetype_u8 b = *src != 0;
+ *dst = scalar @OP@ b;
+ }
+}
+
+static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
+{
+ npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0];
+ npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+ npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
+ const npyv_u8 vzero = npyv_setall_u8(0x0);
+ const npyv_u8 vscalar = npyv_setall_u8(scalar);
+ const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero);
+ const npyv_u8 truemask = npyv_setall_u8(0x1);
+ const int vstep = npyv_nlanes_u8;
+
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+ npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+ npyv_b8 c = npyv_@VOP@_b8(a, b);
+ npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ }
+
+ for (; len > 0; --len, ++src, ++dst) {
+ const npyv_lanetype_u8 a = *src != 0;
+ *dst = a @OP@ scalar;
+ }
+}
+#endif
+/**end repeat**/
+
+/**begin repeat
+ * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int,
+ npy_ulonglong, npy_longlong, npy_float, npy_double#
+ * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
+ * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
+ * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ */
+#if !((@eq@ || @neq@) && @signed@)
+static NPY_INLINE void
+run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if @VECTOR@
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+#endif
+
+ BINARY_LOOP {
+#if @bool@
+ npy_bool in1 = *((npy_bool *)ip1) != 0;
+ npy_bool in2 = *((npy_bool *)ip2) != 0;
+#else
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+#endif
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
+/*
+ * To reduce the size of the binary generated from this source, two rules are
+ * applied: 1) each data type implements 'greater' as a call to 'less' with the
+ * arguments swapped, and likewise 'greater_equal' is implemented via
+ * 'less_equal'; 2) for integer datatypes of the same size (e.g. 8-bit), a
+ * single kernel for 'equal' and 'not_equal' serves both the signed and
+ * unsigned types.
+ */
+
+/**begin repeat
+ * Signed and Unsigned types
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG#
+ * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG#
+ * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1#
+ */
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@STYPE@ == @len@
+ #define TO_SIMD_UTYPE(X) X##_u@len@
+ #if @signed@
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #else
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ char *nargs[3] = {args[1], args[0], args[2]};
+ npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+ TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * Boolean & Float types
+ * #TYPE = BOOL, FLOAT, DOUBLE#
+ * #sfx = b8, f32, f64#
+ * #fp = 0, 1, 1#
+ */
+/**begin repeat1
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ char *nargs[3] = {args[1], args[0], args[2]};
+ npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+ run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps);
+#if @fp@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ run_binary_simd_@kind@_@sfx@(args, dimensions, steps);
+#if @fp@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
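The comment at the top of the inner-function section above states two size-reduction rules: greater/greater_equal reuse the less/less_equal kernels with operands and strides swapped, and equal/not_equal share one kernel per integer width because equality only looks at bit patterns. A small NumPy check of both facts (illustration only, not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.integers(-5, 5, size=100, dtype=np.int8)
    b = rng.integers(-5, 5, size=100, dtype=np.int8)

    # rule 1: greater(a, b) is exactly less(b, a) with the operands swapped
    assert np.array_equal(np.greater(a, b), np.less(b, a))
    assert np.array_equal(np.greater_equal(a, b), np.less_equal(b, a))

    # rule 2: equality is sign-agnostic, so reinterpreting the same bytes
    # as unsigned gives the same result for equal/not_equal
    assert np.array_equal(np.equal(a, b),
                          np.equal(a.view(np.uint8), b.view(np.uint8)))
    print("both rules hold")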
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b477027b3..d6c9a7e65 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -159,55 +159,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
/**end repeat1**/
/**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
- * logical_and, logical_or#
- * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
- */
-
-#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-/* prototypes */
-static void
-sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
- npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- npy_bool * op = (npy_bool *)args[2];
- npy_intp n = dimensions[0];
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
- sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
- sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
- sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat1**/
-
-/**begin repeat1
* #kind = isnan, isfinite, isinf, signbit#
*/
@@ -476,101 +427,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
/**end repeat1**/
-/**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
-*/
-
-/* sets invalid fpu flag on QNaN for consistency with packed compare */
-NPY_FINLINE int
-sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
-{
- @vtype@ one = @vpre@_set1_@vsuf@(1);
- @type@ tmp;
- @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
- @vpre@_load_@vsufs@(&b));
- v = @vpre@_and_@vsuf@(v, one);
- @vpre@_store_@vsufs@(&tmp, v);
- return tmp;
-}
-
-static void
-sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
- }
-}
-
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
- }
-}
-
-
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
- }
-}
-/**end repeat1**/
-
-
static void
sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index e4b5e0c8f..f33db95fc 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -126,7 +126,8 @@ class _SIMD_BOOL(_Test_Utility):
"""
Logical operations for boolean types.
Test intrinsics:
- npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX
+ npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX,
+ npyv_andc_b8, npyv_orc_b8, npyv_xnor_b8
"""
data_a = self._data()
data_b = self._data(reverse=True)
@@ -148,6 +149,22 @@ class _SIMD_BOOL(_Test_Utility):
vnot = getattr(self, "not")(vdata_a)
assert vnot == data_b
+ # among the boolean types, andc, orc and xnor only support b8
+ if self.sfx not in ("b8"):
+ return
+
+ data_andc = [(a & ~b) & 0xFF for a, b in zip(data_a, data_b)]
+ vandc = getattr(self, "andc")(vdata_a, vdata_b)
+ assert data_andc == vandc
+
+ data_orc = [(a | ~b) & 0xFF for a, b in zip(data_a, data_b)]
+ vorc = getattr(self, "orc")(vdata_a, vdata_b)
+ assert data_orc == vorc
+
+ data_xnor = [~(a ^ b) & 0xFF for a, b in zip(data_a, data_b)]
+ vxnor = getattr(self, "xnor")(vdata_a, vdata_b)
+ assert data_xnor == vxnor
+
def test_tobits(self):
data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
for data in (self._data(), self._data(reverse=True)):
@@ -156,6 +173,37 @@ class _SIMD_BOOL(_Test_Utility):
tobits = bin(self.tobits(vdata))
assert tobits == bin(data_bits)
+ def test_pack(self):
+ """
+ Pack multiple vectors into one
+ Test intrinsics:
+ npyv_pack_b8_b16
+ npyv_pack_b8_b32
+ npyv_pack_b8_b64
+ """
+ if self.sfx not in ("b16", "b32", "b64"):
+ return
+ # create the vectors
+ data = self._data()
+ rdata = self._data(reverse=True)
+ vdata = self._load_b(data)
+ vrdata = self._load_b(rdata)
+ pack_simd = getattr(self.npyv, f"pack_b8_{self.sfx}")
+ # For the scalar reference, concatenate the input lists into a single
+ # list (spack) and mask each element to its lowest byte, which is what
+ # the packed 8-bit boolean lanes hold.
+ if self.sfx == "b16":
+ spack = [(i & 0xFF) for i in (list(rdata) + list(data))]
+ vpack = pack_simd(vrdata, vdata)
+ elif self.sfx == "b32":
+ spack = [(i & 0xFF) for i in (2*list(rdata) + 2*list(data))]
+ vpack = pack_simd(vrdata, vrdata, vdata, vdata)
+ elif self.sfx == "b64":
+ spack = [(i & 0xFF) for i in (4*list(rdata) + 4*list(data))]
+ vpack = pack_simd(vrdata, vrdata, vrdata, vrdata,
+ vdata, vdata, vdata, vdata)
+ assert vpack == spack
+
class _SIMD_INT(_Test_Utility):
"""
To test all integer vector types at once
@@ -792,6 +840,12 @@ class _SIMD_ALL(_Test_Utility):
vnot = cast(getattr(self, "not")(vdata_a))
assert vnot == data_not
+ if self.sfx not in ("u8"):
+ return
+ data_andc = [a & ~b for a, b in zip(data_cast_a, data_cast_b)]
+ vandc = cast(getattr(self, "andc")(vdata_a, vdata_b))
+ assert vandc == data_andc
+
def test_conversion_boolean(self):
bsfx = "b" + self.sfx[1:]
to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index dd0bb88ff..7b6e2ee92 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -185,6 +185,52 @@ class TestOut:
class TestComparisons:
+ import operator
+
+ @pytest.mark.parametrize('dtype', np.sctypes['uint'] + np.sctypes['int'] +
+ np.sctypes['float'] + [np.bool_])
+ @pytest.mark.parametrize('py_comp,np_comp', [
+ (operator.lt, np.less),
+ (operator.le, np.less_equal),
+ (operator.gt, np.greater),
+ (operator.ge, np.greater_equal),
+ (operator.eq, np.equal),
+ (operator.ne, np.not_equal)
+ ])
+ def test_comparison_functions(self, dtype, py_comp, np_comp):
+ # Initialize input arrays
+ if dtype == np.bool_:
+ a = np.random.choice(a=[False, True], size=1000)
+ b = np.random.choice(a=[False, True], size=1000)
+ scalar = True
+ else:
+ a = np.random.randint(low=1, high=10, size=1000).astype(dtype)
+ b = np.random.randint(low=1, high=10, size=1000).astype(dtype)
+ scalar = 5
+ np_scalar = np.dtype(dtype).type(scalar)
+ a_lst = a.tolist()
+ b_lst = b.tolist()
+
+ # (Binary) Comparison (x1=array, x2=array)
+ comp_b = np_comp(a, b)
+ comp_b_list = [py_comp(x, y) for x, y in zip(a_lst, b_lst)]
+
+ # (Scalar1) Comparison (x1=scalar, x2=array)
+ comp_s1 = np_comp(np_scalar, b)
+ comp_s1_list = [py_comp(scalar, x) for x in b_lst]
+
+ # (Scalar2) Comparison (x1=array, x2=scalar)
+ comp_s2 = np_comp(a, np_scalar)
+ comp_s2_list = [py_comp(x, scalar) for x in a_lst]
+
+ # Sequence: Binary, Scalar1 and Scalar2
+ assert_(comp_b.tolist() == comp_b_list,
+ f"Failed comparison ({py_comp.__name__})")
+ assert_(comp_s1.tolist() == comp_s1_list,
+ f"Failed comparison ({py_comp.__name__})")
+ assert_(comp_s2.tolist() == comp_s2_list,
+ f"Failed comparison ({py_comp.__name__})")
+
def test_ignore_object_identity_in_equal(self):
# Check comparing identical objects whose comparison
# is not a simple boolean, e.g., arrays that are compared elementwise.