author     Sayed Adel <seiko@imavr.com>    2022-02-14 07:12:48 +0200
committer  Sayed Adel <seiko@imavr.com>    2023-01-29 13:02:39 +0200
commit     de95f3cfbafb08674b6dfd13f780703ebd48bf10 (patch)
tree       eb3f0f1123c0a10b33c2143dd5ee4f6e1446586b /numpy/core
parent     640e85017aa8eac3e9be68b475acf27d623b16b7 (diff)
download   numpy-de95f3cfbafb08674b6dfd13f780703ebd48bf10.tar.gz
ENH: Implement intrinsics for shuffle over 128-bit lane and unzip
The shuffle intrinsics support 32-bit/64-bit vector data types; the unzip (deinterleave) intrinsics support all data types.
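As a rough scalar model of what the two new families compute (illustrative only; permi128_f32_model and unzip_model below are hypothetical helpers, not part of the patch):

#include <stdio.h>

/* npyv_permi128_f32(a, e0, e1, e2, e3): within every 128-bit lane,
 * result element i is input element e_i of the same lane. */
static void permi128_f32_model(float *dst, const float *src, int nlanes,
                               int e0, int e1, int e2, int e3)
{
    for (int base = 0; base < nlanes; base += 4) {
        dst[base+0] = src[base+e0];
        dst[base+1] = src[base+e1];
        dst[base+2] = src[base+e2];
        dst[base+3] = src[base+e3];
    }
}

/* npyv_unzip_u8(ab0, ab1): deinterleave; val[0] receives the even-indexed
 * elements of the concatenation (ab0, ab1), val[1] the odd-indexed ones. */
static void unzip_model(unsigned char *even, unsigned char *odd,
                        const unsigned char *ab, int n)
{
    for (int i = 0; i < n; i++) {
        even[i] = ab[2*i];
        odd[i]  = ab[2*i + 1];
    }
}

int main(void)
{
    float v[8] = {0, 1, 2, 3, 4, 5, 6, 7}, p[8];
    permi128_f32_model(p, v, 8, 3, 2, 1, 0);  /* reverse each 128-bit lane */
    for (int i = 0; i < 8; i++)
        printf("%g ", p[i]);                  /* 3 2 1 0 7 6 5 4 */
    printf("\n");
    return 0;
}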
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src    |  62
-rw-r--r--  numpy/core/src/_simd/_simd_easyintrin.inc    |  33
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h    |  87
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h  | 152
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h    | 120
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h     |  87
-rw-r--r--  numpy/core/src/common/simd/vec/reorder.h     |  99
-rw-r--r--  numpy/core/tests/test_simd.py                |  20
8 files changed, 633 insertions, 27 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 48023af80..847891386 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -300,7 +300,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
/**begin repeat1
- * # intrin = combine, zip#
+ * # intrin = combine, zip, unzip#
*/
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
/**end repeat1**/
@@ -309,6 +309,60 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
#endif
+// special implementation to convert runtime constants to immediate values
+#if @size@ == 32
+// one call per element index, then gather them within one vector
+// instead of unrolling the 255 possible cases.
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+ /**begin repeat1
+ * # en = e0, e1, e2, e3#
+ */
+ npyv_@sfx@ v@en@;
+ npyv_lanetype_@sfx@ d@en@[npyv_nlanes_@sfx@];
+ if (0) {}
+ /**begin repeat2
+ * # imm = 1, 2, 3#
+ */
+ else if (@en@ == @imm@) {
+ v@en@ = npyv_permi128_@sfx@(a, @imm@, @imm@, @imm@, @imm@);
+ }
+ /**end repeat2**/
+ else {
+ v@en@ = npyv_permi128_@sfx@(a, 0, 0, 0, 0);
+ }
+ npyv_store_@sfx@(d@en@, v@en@);
+ /**end repeat1**/
+ if (e0 == e1 && e0 == e2 && e0 == e3) {
+ return ve0;
+ }
+ for (int i = 0; i < npyv_nlanes_@sfx@; i += 4) {
+ de0[i+1] = de1[i+1];
+ de0[i+2] = de2[i+2];
+ de0[i+3] = de3[i+3];
+ }
+ return npyv_load_@sfx@(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8, u8, u8)
+#elif @size@ == 64
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1)
+{
+ if (e0 == 1 && e1 == 0) {
+ return npyv_permi128_@sfx@(a, 1, 0);
+ }
+ else if (e0 == 0 && e1 == 1) {
+ return npyv_permi128_@sfx@(a, 0, 1);
+ }
+ else if (e0 == 1 && e1 == 1) {
+ return npyv_permi128_@sfx@(a, 1, 1);
+ }
+ return npyv_permi128_@sfx@(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8)
+#endif
+
/***************************
* Operators
***************************/
@@ -584,7 +638,7 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
* Reorder
***************************/
/**begin repeat1
- * # intrin = combinel, combineh, combine, zip#
+ * # intrin = combinel, combineh, combine, zip, unzip#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -593,6 +647,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(rev64_@sfx@)
#endif
+#if @size@ > 16
+{ "permi128_@sfx@", simd__intrin_permi128_@sfx@_, METH_VARARGS, NULL },
+#endif
+
/***************************
* Operators
***************************/
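The permi128 wrapper above exists because the underlying shuffles demand immediate operands: each runtime index is funneled through a short else-if chain of literal-argument calls, the splatted results are stored, and the final vector is gathered lane by lane. A minimal scalar sketch of that dispatch idea, with SHUFFLE4 as a hypothetical stand-in for the immediate-only npyv_permi128_*:

/* SHUFFLE4 models an intrinsic that only accepts literal (immediate)
 * indices, like the _mm_shuffle_epi32 behind npyv_permi128_u32. */
#define SHUFFLE4(dst, src, I0, I1, I2, I3) do {   \
    (dst)[0] = (src)[I0]; (dst)[1] = (src)[I1];   \
    (dst)[2] = (src)[I2]; (dst)[3] = (src)[I3];   \
} while (0)

static void shuffle4_runtime(int dst[4], const int src[4],
                             const unsigned e[4])
{
    int v[4][4];
    /* one dispatch per element index -> a handful of branches,
     * instead of unrolling every index combination at once */
    for (int k = 0; k < 4; k++) {
        switch (e[k] & 3) {
            case 1:  SHUFFLE4(v[k], src, 1, 1, 1, 1); break;
            case 2:  SHUFFLE4(v[k], src, 2, 2, 2, 2); break;
            case 3:  SHUFFLE4(v[k], src, 3, 3, 3, 3); break;
            default: SHUFFLE4(v[k], src, 0, 0, 0, 0); break;
        }
    }
    /* gather lane k from the k-th splatted vector */
    for (int k = 0; k < 4; k++)
        dst[k] = v[k][k];
}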
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index f2e0da26e..e300e5484 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -197,6 +197,39 @@
return simd_arg_to_obj(&ret); \
}
+#define SIMD_IMPL_INTRIN_5(NAME, RET, IN0, IN1, IN2, IN3, IN4) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ simd_arg arg4 = {.dtype = simd_data_##IN3}; \
+ simd_arg arg5 = {.dtype = simd_data_##IN4}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3, \
+ simd_arg_converter, &arg4, \
+ simd_arg_converter, &arg5 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2, arg4.data.IN3, \
+ arg5.data.IN4 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg_free(&arg4); \
+ simd_arg_free(&arg5); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
/**
* Helper macros for repeating and expanding a certain macro.
* Mainly used for converting a scalar to an immediate constant.
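SIMD_IMPL_INTRIN_5 mirrors the existing one- to four-argument helpers: parse five arguments through PyArg_ParseTuple's O& converters, invoke the native intrinsic, release the argument buffers, and box the result. A generic sketch of the O& pattern using only the CPython API (MyArg, my_conv, and intrin5 are hypothetical names, not part of the simd module):

#include <Python.h>

typedef struct { double v; } MyArg;

/* O& converter: return 1 on success, 0 on failure with an exception set */
static int my_conv(PyObject *obj, void *addr)
{
    double d = PyFloat_AsDouble(obj);
    if (d == -1.0 && PyErr_Occurred()) {
        return 0;
    }
    ((MyArg *)addr)->v = d;
    return 1;
}

static PyObject *intrin5(PyObject *self, PyObject *args)
{
    MyArg a1, a2, a3, a4, a5;
    if (!PyArg_ParseTuple(args, "O&O&O&O&O&:intrin5",
                          my_conv, &a1, my_conv, &a2, my_conv, &a3,
                          my_conv, &a4, my_conv, &a5)) {
        return NULL;
    }
    return PyFloat_FromDouble(a1.v + a2.v + a3.v + a4.v + a5.v);
}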
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+ npyv_u8x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+ npyv_u16x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+ __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+ npyv_u64x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+ __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+ return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+ npyv_f64x2 r;
+ r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
#endif // _NPY_SIMD_AVX2_REORDER_H
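There is no native 64-bit element shuffle here, so npyv_permi128_u64 maps each 64-bit index E to the 32-bit pair (2E, 2E+1) and reuses _mm256_shuffle_epi32. A plain-C check of that bit arithmetic, assuming MM_SHUFFLE and shuffle_epi32_model faithfully mirror the intrinsic's documented immediate encoding:

#include <assert.h>
#include <stdint.h>

#define MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* scalar model of _mm_shuffle_epi32 on a single 128-bit lane */
static void shuffle_epi32_model(uint32_t r[4], const uint32_t a[4], int imm)
{
    for (int i = 0; i < 4; i++)
        r[i] = a[(imm >> (2 * i)) & 3];
}

int main(void)
{
    /* one lane viewed as two uint64s: q0 = (10, 11), q1 = (20, 21) */
    uint32_t lane[4] = {10, 11, 20, 21}, out[4];
    int E0 = 1, E1 = 0;  /* request (q1, q0) */
    int imm = MM_SHUFFLE((E1 << 1) + 1, (E1 << 1), (E0 << 1) + 1, (E0 << 1));
    shuffle_epi32_model(out, lane, imm);
    assert(out[0] == 20 && out[1] == 21 && out[2] == 10 && out[3] == 11);
    return 0;
}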
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ const __m512i idx_a = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+ );
+ const __m512i idx_b = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+ );
+ r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+ __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+ #else
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+ #endif
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx_a = npyv_set_u16(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ );
+ const __m512i idx_b = npyv_set_u16(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ );
+ r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
#endif // _NPY_SIMD_AVX512_REORDER_H
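The AVX-512 paths lean on _mm512_permutex2var_*, which selects result[i] from the 2N-element concatenation of both sources by idx[i]. A scalar model of that semantics for the 8-lane 64-bit case (permutex2var_model is illustrative):

#include <stdio.h>
#include <stdint.h>

static void permutex2var_model(uint64_t r[8], const uint64_t a[8],
                               const uint64_t idx[8], const uint64_t b[8])
{
    for (int i = 0; i < 8; i++) {
        uint64_t j = idx[i] & 15;          /* 4 index bits, 16 sources */
        r[i] = j < 8 ? a[j] : b[j - 8];
    }
}

int main(void)
{
    uint64_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint64_t b[8] = {8, 9, 10, 11, 12, 13, 14, 15};
    uint64_t idx_a[8] = {0, 2, 4, 6, 8, 10, 12, 14};  /* even elements */
    uint64_t r[8];
    permutex2var_model(r, a, idx_a, b);
    for (int i = 0; i < 8; i++)
        printf("%llu ", (unsigned long long)r[i]);    /* 0 2 4 ... 14 */
    printf("\n");
    return 0;
}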
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
- NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
- { \
- T_VEC##x2 r; \
- r.val[0] = vzip1q_##SFX(a, b); \
- r.val[1] = vzip2q_##SFX(a, b); \
- return r; \
- }
-
+// interleave & deinterleave two vectors
#ifdef __aarch64__
- NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
- NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
- NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
- NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
- NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
- NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
- NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
- NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vuzp1q_##SFX(a, b); \
+ r.val[1] = vuzp2q_##SFX(a, b); \
+ return r; \
+ }
#else
- #define npyv_zip_u8 vzipq_u8
- #define npyv_zip_s8 vzipq_s8
- #define npyv_zip_u16 vzipq_u16
- #define npyv_zip_s16 vzipq_s16
- #define npyv_zip_u32 vzipq_u32
- #define npyv_zip_s32 vzipq_s32
- #define npyv_zip_f32 vzipq_f32
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { return vzipq_##SFX(a, b); } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { return vuzpq_##SFX(a, b); }
#endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
// Reverse elements of each 64-bit lane
#define npyv_rev64_u8 vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#ifdef __clang__
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ npyv_set_u32( \
+ vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+ vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+ )
+ #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+ npyv_set_s32( \
+ vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+ vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+ )
+ #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ npyv_set_f32( \
+ vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+ vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s32 npyv_permi128_u32
+ #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) \
+ npyv_set_u64( \
+ vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+ )
+ #define npyv_permi128_s64(A, E0, E1) \
+ npyv_set_s64( \
+ vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+ )
+ #define npyv_permi128_f64(A, E0, E1) \
+ npyv_set_f64( \
+ vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s64 npyv_permi128_u64
+ #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+ #undef npyv_permi128_f64
+#endif
+
#endif // _NPY_SIMD_NEON_REORDER_H
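The NEON permi128 fallbacks ride on the compilers' generic shuffle builtins, which also work on plain vector_size types; a small sketch, assuming GCC or Clang, that contrasts the two forms:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32x4 __attribute__((vector_size(16)));

int main(void)
{
    u32x4 a = {10, 11, 12, 13};
#ifdef __clang__
    /* Clang: indices are extra literal arguments */
    u32x4 r = __builtin_shufflevector(a, a, 3, 2, 1, 0);
#else
    /* GCC: indices are packed into a vector */
    u32x4 idx = {3, 2, 1, 0};
    u32x4 r = __builtin_shuffle(a, idx);
#endif
    printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  /* 13 12 11 10 */
    return 0;
}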
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u8(abl, abh);
+#else
+ __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+ __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+ __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+ __m128i ab_26ae = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+ __m128i ab_0246 = _mm_unpacklo_epi8(ab_048c, ab_26ae);
+ __m128i ab_8ace = _mm_unpackhi_epi8(ab_048c, ab_26ae);
+ npyv_u8x2 r;
+ r.val[0] = _mm_unpacklo_epi8(ab_0246, ab_8ace);
+ r.val[1] = _mm_unpackhi_epi8(ab_0246, ab_8ace);
+ return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u16(abl, abh);
+#else
+ __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+ __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+ __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+ __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+ npyv_u16x2 r;
+ r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+ r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+ return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+ r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
{
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
#endif // _NPY_SIMD_SSE_REORDER_H
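The pre-SSSE3 u8 path deinterleaves with four rounds of unpacklo/unpackhi instead of a byte shuffle. A scalar model of why four rounds of this perfect-shuffle pair separate evens from odds across 32 bytes (unpack8 mirrors _mm_unpacklo_epi8/_mm_unpackhi_epi8):

#include <stdio.h>
#include <string.h>

/* model of _mm_unpacklo_epi8 (lo) and _mm_unpackhi_epi8 (hi) */
static void unpack8(unsigned char lo[16], unsigned char hi[16],
                    const unsigned char a[16], const unsigned char b[16])
{
    unsigned char l[16], h[16];
    for (int i = 0; i < 8; i++) {
        l[2*i] = a[i];      l[2*i + 1] = b[i];
        h[2*i] = a[8 + i];  h[2*i + 1] = b[8 + i];
    }
    memcpy(lo, l, 16);
    memcpy(hi, h, 16);
}

int main(void)
{
    unsigned char ab0[16], ab1[16];  /* interleaved: a0,b0,a1,b1,... */
    for (int i = 0; i < 16; i++) {
        ab0[i] = (unsigned char)i;
        ab1[i] = (unsigned char)(16 + i);
    }
    unsigned char t0[16], t1[16], u0[16], u1[16];
    unpack8(t0, t1, ab0, ab1);  /* round 1 */
    unpack8(u0, u1, t0, t1);    /* round 2 */
    unpack8(t0, t1, u0, u1);    /* round 3 */
    unpack8(u0, u1, t0, t1);    /* round 4: u0 = evens, u1 = odds */
    for (int i = 0; i < 16; i++)
        printf("%d ", u0[i]);   /* 0 2 4 6 ... 30 */
    printf("\n");
    return 0;
}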
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
#endif
NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u8x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+ npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+ npyv_s8x2 r;
+ r.val[0] = (npyv_s8)ru.val[0];
+ r.val[1] = (npyv_s8)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+ );
+ npyv_u16x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+ npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+ npyv_s16x2 r;
+ r.val[0] = (npyv_s16)ru.val[0];
+ r.val[1] = (npyv_s16)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ npyv_u32 m0 = vec_mergeh(ab0, ab1);
+ npyv_u32 m1 = vec_mergel(ab0, ab1);
+ npyv_u32 r0 = vec_mergeh(m0, m1);
+ npyv_u32 r1 = vec_mergel(m0, m1);
+ npyv_u32x2 r;
+ r.val[0] = r0;
+ r.val[1] = r1;
+ return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_s32x2 r;
+ r.val[0] = (npyv_s32)ru.val[0];
+ r.val[1] = (npyv_s32)ru.val[1];
+ return r;
+}
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+ {
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_f32x2 r;
+ r.val[0] = (npyv_f32)ru.val[0];
+ r.val[1] = (npyv_f32)ru.val[1];
+ return r;
+ }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
#endif
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ vec_perm(A, A, npyv_set_u8( \
+ (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+ (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+ (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+ (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \
+ ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+ #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
#endif // _NPY_SIMD_VEC_REORDER_H
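The vec backend synthesizes permi128 by expanding each 32-bit element index into its four byte positions for vec_perm. A scalar model of that index construction (vec_perm_model is illustrative and ignores endianness details):

#include <stdio.h>
#include <stdint.h>

/* scalar model of vec_perm: pick 16 bytes out of the 32-byte pair (a, b) */
static void vec_perm_model(uint8_t r[16], const uint8_t a[16],
                           const uint8_t b[16], const uint8_t idx[16])
{
    for (int i = 0; i < 16; i++) {
        uint8_t j = idx[i] & 31;
        r[i] = j < 16 ? a[j] : b[j - 16];
    }
}

int main(void)
{
    /* four 32-bit elements, element k stored as bytes 4k..4k+3 */
    uint8_t a[16], r[16], idx[16];
    for (int i = 0; i < 16; i++)
        a[i] = (uint8_t)i;
    int e[4] = {3, 2, 1, 0};       /* requested element order */
    for (int k = 0; k < 4; k++)
        for (int byte = 0; byte < 4; byte++)
            idx[4*k + byte] = (uint8_t)((e[k] << 2) + byte);
    vec_perm_model(r, a, a, idx);
    for (int i = 0; i < 16; i++)
        printf("%d ", r[i]);       /* 12 13 14 15 8 9 10 11 4 5 6 7 0 1 2 3 */
    printf("\n");
    return 0;
}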
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 2c16243db..9e65a4b17 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -906,6 +906,26 @@ class _SIMD_ALL(_Test_Utility):
rev64 = self.rev64(self.load(range(self.nlanes)))
assert rev64 == data_rev64
+    def test_reorder_permi128(self):
+        """
+        Test permuting elements for each 128-bit lane.
+        npyv_permi128_##sfx
+        """
+        ssize = self._scalar_size()
+        if ssize < 32:
+            return
+        data = self.load(self._data())
+        permn = 128//ssize
+        permd = permn-1
+        nlane128 = self.nlanes//permn
+
+        shfl = [0, 1] if ssize == 64 else [0, 2, 4, 6]
+        for i in range(permn**permn):
+            indices = [(i >> shf) & permd for shf in shfl]
+            vperm = self.permi128(data, *indices)
+            data_vperm = [data[j + (e & -permn)] for e, j in enumerate(indices*nlane128)]
+            assert vperm == data_vperm
+
@pytest.mark.parametrize('func, intrin', [
(operator.lt, "cmplt"),
(operator.le, "cmple"),
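The test above drives every index tuple from a single counter: each entry of shfl slices log2(permn) bits out of i, so range(permn**permn) enumerates all combinations. The same decode, written out in C for the 32-bit case:

#include <stdio.h>

int main(void)
{
    const int shfl[4] = {0, 2, 4, 6};  /* bit offsets, 2 bits per index */
    const int permd = 3;               /* permn - 1, the index mask */
    for (int i = 0; i < 256; i++) {    /* 4**4 index tuples */
        int e[4];
        for (int k = 0; k < 4; k++)
            e[k] = (i >> shfl[k]) & permd;
        if (i < 3 || i == 255)         /* print a few samples */
            printf("i=%3d -> (%d, %d, %d, %d)\n", i, e[0], e[1], e[2], e[3]);
    }
    return 0;
}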