summaryrefslogtreecommitdiff
path: root/numpy/core/src/common
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2022-02-14 07:12:48 +0200
committerSayed Adel <seiko@imavr.com>2023-01-29 13:02:39 +0200
commitde95f3cfbafb08674b6dfd13f780703ebd48bf10 (patch)
treeeb3f0f1123c0a10b33c2143dd5ee4f6e1446586b /numpy/core/src/common
parent640e85017aa8eac3e9be68b475acf27d623b16b7 (diff)
downloadnumpy-de95f3cfbafb08674b6dfd13f780703ebd48bf10.tar.gz
ENH: Implement intrinsics for shuffle over 128-bit lane and unzip
shuffle intrinsics support 32-bit/64-bit vector data types; unzip (deinterleave) intrinsics support all data types.
Diffstat (limited to 'numpy/core/src/common')
-rw-r--r--numpy/core/src/common/simd/avx2/reorder.h87
-rw-r--r--numpy/core/src/common/simd/avx512/reorder.h152
-rw-r--r--numpy/core/src/common/simd/neon/reorder.h120
-rw-r--r--numpy/core/src/common/simd/sse/reorder.h87
-rw-r--r--numpy/core/src/common/simd/vec/reorder.h99
5 files changed, 520 insertions, 25 deletions
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+ npyv_u8x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+ npyv_u16x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+ __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+ npyv_u64x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+ __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+ return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+ npyv_f64x2 r;
+ r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
#endif // _NPY_SIMD_AVX2_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ const __m512i idx_a = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+ );
+ const __m512i idx_b = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+ );
+ r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+ __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+ #else
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+ #endif
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx_a = npyv_set_u16(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ );
+ const __m512i idx_b = npyv_set_u16(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ );
+ r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
- NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
- { \
- T_VEC##x2 r; \
- r.val[0] = vzip1q_##SFX(a, b); \
- r.val[1] = vzip2q_##SFX(a, b); \
- return r; \
- }
-
+// interleave & deinterleave two vectors
#ifdef __aarch64__
- NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
- NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
- NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
- NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
- NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
- NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
- NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
- NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vuzp1q_##SFX(a, b); \
+ r.val[1] = vuzp2q_##SFX(a, b); \
+ return r; \
+ }
#else
- #define npyv_zip_u8 vzipq_u8
- #define npyv_zip_s8 vzipq_s8
- #define npyv_zip_u16 vzipq_u16
- #define npyv_zip_s16 vzipq_s16
- #define npyv_zip_u32 vzipq_u32
- #define npyv_zip_s32 vzipq_s32
- #define npyv_zip_f32 vzipq_f32
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { return vzipq_##SFX(a, b); } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { return vuzpq_##SFX(a, b); }
#endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
// Reverse elements of each 64-bit lane
#define npyv_rev64_u8 vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#ifdef __clang__
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ npyv_set_u32( \
+ vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+ vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+ )
+ #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+ npyv_set_s32( \
+ vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+ vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+ )
+ #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ npyv_set_f32( \
+ vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+ vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s32 npyv_permi128_u32
+ #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) \
+ npyv_set_u64( \
+ vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+ )
+ #define npyv_permi128_s64(A, E0, E1) \
+ npyv_set_s64( \
+ vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+ )
+ #define npyv_permi128_f64(A, E0, E1) \
+ npyv_set_f64( \
+ vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s64 npyv_permi128_u64
+ #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+ #undef npyv_permi128_f64
+#endif
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u8(abl, abh);
+#else
+ __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+ __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+ __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+ __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+ __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be);
+ __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be);
+ npyv_u8x2 r;
+ r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8);
+ r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8);
+ return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u16(abl, abh);
+#else
+ __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+ __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+ __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+ __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+ npyv_u16x2 r;
+ r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+ r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+ return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+ r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
{
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
#endif
NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u8x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+ npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+ npyv_s8x2 r;
+ r.val[0] = (npyv_s8)ru.val[0];
+ r.val[1] = (npyv_s8)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+ );
+ npyv_u16x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+ npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+ npyv_s16x2 r;
+ r.val[0] = (npyv_s16)ru.val[0];
+ r.val[1] = (npyv_s16)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ npyv_u32 m0 = vec_mergeh(ab0, ab1);
+ npyv_u32 m1 = vec_mergel(ab0, ab1);
+ npyv_u32 r0 = vec_mergeh(m0, m1);
+ npyv_u32 r1 = vec_mergel(m0, m1);
+ npyv_u32x2 r;
+ r.val[0] = r0;
+ r.val[1] = r1;
+ return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_s32x2 r;
+ r.val[0] = (npyv_s32)ru.val[0];
+ r.val[1] = (npyv_s32)ru.val[1];
+ return r;
+}
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+ {
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_f32x2 r;
+ r.val[0] = (npyv_f32)ru.val[0];
+ r.val[1] = (npyv_f32)ru.val[1];
+ return r;
+ }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
#endif
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ vec_perm(A, A, npyv_set_u8( \
+ (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+ (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+ (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+ (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \
+ ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+ #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
#endif // _NPY_SIMD_VEC_REORDER_H