diff options
| author | Sayed Adel <seiko@imavr.com> | 2022-02-14 07:12:48 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2023-01-29 13:02:39 +0200 |
| commit | de95f3cfbafb08674b6dfd13f780703ebd48bf10 (patch) | |
| tree | eb3f0f1123c0a10b33c2143dd5ee4f6e1446586b /numpy/core/src/common | |
| parent | 640e85017aa8eac3e9be68b475acf27d623b16b7 (diff) | |
| download | numpy-de95f3cfbafb08674b6dfd13f780703ebd48bf10.tar.gz | |
ENH: Implement intrinsics for shuffle over 128-bit lane and unzip
shuffle intrinsics support 32-bit/64-bit vector data types;
unzip (deinterleave) intrinsics support all data types.
Diffstat (limited to 'numpy/core/src/common')
| -rw-r--r-- | numpy/core/src/common/simd/avx2/reorder.h | 87 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/reorder.h | 152 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/reorder.h | 120 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/reorder.h | 87 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vec/reorder.h | 99 |
5 files changed, 520 insertions, 25 deletions
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h index 4d6ec8f75..9ebe0e7f4 100644 --- a/numpy/core/src/common/simd/avx2/reorder.h +++ b/numpy/core/src/common/simd/avx2/reorder.h @@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b) return npyv_combine_f64(ab0, ab1); } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12); + npyv_u8x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12); + npyv_u16x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx); + __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1); + npyv_u64x2 r; + r.val[0] = 
_mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256 abl = _mm256_permutevar8x32_ps(ab0, idx); + __m256 abh = _mm256_permutevar8x32_ps(ab1, idx); + return npyv_combine_f32(abl, abh); +} + +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1); + npyv_f64x2 r; + r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)) + #endif // _NPY_SIMD_AVX2_REORDER_H diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h index c0b2477f3..27e66b5e7 100644 --- a/numpy/core/src/common/simd/avx512/reorder.h +++ b/numpy/core/src/common/simd/avx512/reorder.h @@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b) return r; } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + npyv_u8x2 r; +#ifdef NPY_HAVE_AVX512VBMI + const __m512i idx_a = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126 + ); + const __m512i idx_b = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 + ); + r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1); +#else + #ifdef NPY_HAVE_AVX512BW + const __m512i idx = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 
7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m512i abl = _mm512_shuffle_epi8(ab0, idx); + __m512i abh = _mm512_shuffle_epi8(ab1, idx); + #else + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + #endif + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_AVX512BW + const __m512i idx_a = npyv_set_u16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + ); + const __m512i idx_b = npyv_set_u16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + ); + r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1); +#else + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = 
_mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u32x2 r; + r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_u64x2 r; + r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_f32x2 r; + r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_f64x2 r; + r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, 
ab1); + r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))) + #endif // _NPY_SIMD_AVX512_REORDER_H diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h index 50b06ed11..8bf68f5be 100644 --- a/numpy/core/src/common/simd/neon/reorder.h +++ b/numpy/core/src/common/simd/neon/reorder.h @@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32) NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #endif -// interleave two vectors -#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ - NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ - { \ - T_VEC##x2 r; \ - r.val[0] = vzip1q_##SFX(a, b); \ - r.val[1] = vzip2q_##SFX(a, b); \ - return r; \ - } - +// interleave & deinterleave two vectors #ifdef __aarch64__ - NPYV_IMPL_NEON_ZIP(npyv_u8, u8) - NPYV_IMPL_NEON_ZIP(npyv_s8, s8) - NPYV_IMPL_NEON_ZIP(npyv_u16, u16) - NPYV_IMPL_NEON_ZIP(npyv_s16, s16) - NPYV_IMPL_NEON_ZIP(npyv_u32, u32) - NPYV_IMPL_NEON_ZIP(npyv_s32, s32) - NPYV_IMPL_NEON_ZIP(npyv_f32, f32) - NPYV_IMPL_NEON_ZIP(npyv_f64, f64) + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE 
T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vuzp1q_##SFX(a, b); \ + r.val[1] = vuzp2q_##SFX(a, b); \ + return r; \ + } #else - #define npyv_zip_u8 vzipq_u8 - #define npyv_zip_s8 vzipq_s8 - #define npyv_zip_u16 vzipq_u16 - #define npyv_zip_s16 vzipq_s16 - #define npyv_zip_u32 vzipq_u32 - #define npyv_zip_s32 vzipq_s32 - #define npyv_zip_f32 vzipq_f32 + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { return vzipq_##SFX(a, b); } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { return vuzpq_##SFX(a, b); } #endif + +NPYV_IMPL_NEON_ZIP(npyv_u8, u8) +NPYV_IMPL_NEON_ZIP(npyv_s8, s8) +NPYV_IMPL_NEON_ZIP(npyv_u16, u16) +NPYV_IMPL_NEON_ZIP(npyv_s16, s16) +NPYV_IMPL_NEON_ZIP(npyv_u32, u32) +NPYV_IMPL_NEON_ZIP(npyv_s32, s32) +NPYV_IMPL_NEON_ZIP(npyv_f32, f32) + #define npyv_zip_u64 npyv_combine_u64 #define npyv_zip_s64 npyv_combine_s64 +#define npyv_zip_f64 npyv_combine_f64 +#define npyv_unzip_u64 npyv_combine_u64 +#define npyv_unzip_s64 npyv_combine_s64 +#define npyv_unzip_f64 npyv_combine_f64 // Reverse elements of each 64-bit lane #define npyv_rev64_u8 vrev64q_u8 @@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #define npyv_rev64_s32 vrev64q_s32 #define npyv_rev64_f32 vrev64q_f32 +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#ifdef __clang__ + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shufflevector(A, A, E0, E1, E2, E3) +#elif defined(__GNUC__) + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3)) +#else + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + npyv_set_u32( \ + vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \ + vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \ + ) + #define npyv_permi128_s32(A, E0, E1, E2, E3) \ + npyv_set_s32( \ + vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \ + vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \ + ) + #define npyv_permi128_f32(A, E0, E1, E2, E3) \ + npyv_set_f32( \ + vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \ + vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s32 npyv_permi128_u32 + #define npyv_permi128_f32 npyv_permi128_u32 +#endif + +#ifdef __clang__ + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shufflevector(A, A, E0, E1) +#elif defined(__GNUC__) + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shuffle(A, npyv_set_u64(E0, E1)) +#else + #define npyv_permi128_u64(A, E0, E1) \ + npyv_set_u64( \ + vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \ + ) + #define npyv_permi128_s64(A, E0, E1) \ + npyv_set_s64( \ + vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \ + ) + #define npyv_permi128_f64(A, E0, E1) \ + npyv_set_f64( \ + vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s64 npyv_permi128_u64 + #define npyv_permi128_f64 npyv_permi128_u64 +#endif + +#if !NPY_SIMD_F64 + #undef npyv_permi128_f64 +#endif + #endif // _NPY_SIMD_NEON_REORDER_H diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h index d96ab9c56..9a57f6489 100644 --- a/numpy/core/src/common/simd/sse/reorder.h +++ b/numpy/core/src/common/simd/sse/reorder.h @@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, 
epi64) NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps) NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u8(abl, abh); +#else + __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1); + __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1); + __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e); + __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e); + __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be); + __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be); + npyv_u8x2 r; + r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8); + r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8); + return r; +#endif +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u16(abl, abh); +#else + __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1); + __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1); + __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f); + __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f); + npyv_u16x2 r; + r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f); + r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f); + return r; +#endif +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0)); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return 
npyv_combine_u64(ab0, ab1); } +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + npyv_f32x2 r; + r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0)); + r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { @@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0)) + #endif // _NPY_SIMD_SSE_REORDER_H diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h index b60b9287d..3910980a2 100644 --- a/numpy/core/src/common/simd/vec/reorder.h +++ b/numpy/core/src/common/simd/vec/reorder.h @@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64) #endif NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u8x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + 
return r; +} +NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1) +{ + npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1); + npyv_s8x2 r; + r.val[0] = (npyv_s8)ru.val[0]; + r.val[1] = (npyv_s8)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + ); + npyv_u16x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1) +{ + npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1); + npyv_s16x2 r; + r.val[0] = (npyv_s16)ru.val[0]; + r.val[1] = (npyv_s16)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + npyv_u32 m0 = vec_mergeh(ab0, ab1); + npyv_u32 m1 = vec_mergel(ab0, ab1); + npyv_u32 r0 = vec_mergeh(m0, m1); + npyv_u32 r1 = vec_mergel(m0, m1); + npyv_u32x2 r; + r.val[0] = r0; + r.val[1] = r1; + return r; +} +NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1) +{ + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_s32x2 r; + r.val[0] = (npyv_s32)ru.val[0]; + r.val[1] = (npyv_s32)ru.val[1]; + return r; +} +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) + { + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_f32x2 r; + r.val[0] = (npyv_f32)ru.val[0]; + r.val[1] = (npyv_f32)ru.val[1]; + return r; + } +#endif +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1) +{ return npyv_combine_s64(ab0, ab1); } +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit 
lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); } #endif +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + vec_perm(A, A, npyv_set_u8( \ + (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \ + (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \ + (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \ + (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \ + )) +#define npyv_permi128_s32 npyv_permi128_u32 +#define npyv_permi128_f32 npyv_permi128_u32 + +#if defined(__IBMC__) || defined(vec_permi) + #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1)) +#else + #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1)) +#endif +#define npyv_permi128_s64 npyv_permi128_u64 +#define npyv_permi128_f64 npyv_permi128_u64 + #endif // _NPY_SIMD_VEC_REORDER_H |
