author    Sayed Adel <seiko@imavr.com>  2020-11-16 21:27:11 +0000
committer Sayed Adel <seiko@imavr.com>  2020-11-17 01:20:42 +0000
commit    1923034242aa7d09a25c7cd70f0a27d280c68f71
tree      42b91ca6e59ef6530193036e9542ce628608d4bd /numpy/core
parent    d6ecc97f55fd7ce3a5f3d4709938e7cc066900b8
ENH, SIMD: Add new NPYV intrinsics pack(0)
- add bitfield conversion for boolean vectors
- add reversal of the elements within each 64-bit lane
- add test cases
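
As a usage illustration (not part of the commit), a minimal sketch of the two new intrinsic families. It assumes an NPY_SIMD-enabled build and the pre-existing NPYV load/compare/zero intrinsics (npyv_load_u8, npyv_cmpeq_u8, npyv_zero_u8); only npyv_tobits_b8 and npyv_rev64_u8 come from this patch:

    #include "simd/simd.h"

    #if NPY_SIMD
    // count the zero bytes in one vector-width chunk via the new mask->bitfield helper
    static int count_zero_bytes(const npy_uint8 *p)
    {
        npyv_u8    chunk   = npyv_load_u8(p);              // one vector of bytes
        npyv_b8    is_zero = npyv_cmpeq_u8(chunk, npyv_zero_u8());
        npy_uint64 bits    = npyv_tobits_b8(is_zero);      // lane i -> bit i
        int count = 0;
        for (; bits != 0; bits &= bits - 1) {              // clear lowest set bit
            ++count;
        }
        return count;
    }

    // byte-swap each 64-bit element of a vector of bytes
    static npyv_u8 bswap64_bytes(npyv_u8 a)
    { return npyv_rev64_u8(a); }
    #endif // NPY_SIMD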
Diffstat (limited to 'numpy/core')
 numpy/core/src/_simd/_simd.dispatch.c.src      | 67
 numpy/core/src/common/simd/avx2/conversion.h   | 22
 numpy/core/src/common/simd/avx2/reorder.h      | 32
 numpy/core/src/common/simd/avx512/conversion.h | 31
 numpy/core/src/common/simd/avx512/reorder.h    | 56
 numpy/core/src/common/simd/neon/conversion.h   | 66
 numpy/core/src/common/simd/neon/reorder.h      |  9
 numpy/core/src/common/simd/sse/conversion.h    | 21
 numpy/core/src/common/simd/sse/reorder.h       | 41
 numpy/core/src/common/simd/vsx/conversion.h    | 22
 numpy/core/src/common/simd/vsx/reorder.h       | 41
 numpy/core/tests/test_simd.py                  | 40
 12 files changed, 409 insertions(+), 39 deletions(-)
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 18c383871..e3dbcdece 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -9,9 +9,9 @@
 #include "_simd_arg.inc"
 #include "_simd_easyintrin.inc"
 
-/*************************************************************************
- * Defining NPYV intrinsics as module functions
- *************************************************************************/
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
 /**begin repeat
  * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
@@ -22,6 +22,7 @@
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -227,7 +228,6 @@ err:
 /**end repeat1**/
 #endif // @ncont_sup@
 
-
 /***************************
  * Misc
  ***************************/
@@ -289,6 +289,10 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
 /**end repeat1**/
 
+#if @rev64_sup@
+SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
+#endif
+
 /***************************
  * Operators
  ***************************/
@@ -370,14 +374,26 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
 #endif // simd_sup
 /**end repeat**/
 
-/***************************
+/*************************************************************************
  * Variant
- ***************************/
+ ************************************************************************/
 SIMD_IMPL_INTRIN_0N(cleanup)
-
 /*************************************************************************
- * Attach module functions
- *************************************************************************/
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
+/**end repeat**/
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
 static PyMethodDef simd__intrinsics_methods[] = {
 /**begin repeat
  * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
@@ -389,6 +405,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -416,7 +433,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 #endif // ncont_sup
 
-
 /***************************
  * Misc
  ***************************/
@@ -444,8 +460,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
-SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
-SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+#if @rev64_sup@
+SIMD_INTRIN_DEF(rev64_@sfx@)
+#endif
 
 /***************************
  * Operators
@@ -517,23 +534,35 @@ SIMD_INTRIN_DEF(sum_@sfx@)
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 #endif
-
 #endif // simd_sup
 /**end repeat**/
 
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
 /***************************
- * Variant
+ * Conversions
 ***************************/
-SIMD_INTRIN_DEF(cleanup)
-/***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(tobits_@bsfx@)
+/**end repeat**/
+
+/************************************************************************/
 {NULL, NULL, 0, NULL}
 }; // PyMethodDef
 
 #endif // NPY_SIMD
 
-/*************************************************************************
- * Defining a separate module for each target
- *************************************************************************/
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
 NPY_VISIBILITY_HIDDEN PyObject *
 NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
 {
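
For reference, one iteration of the repeater above: substituting #sfx = u8# and #rev64_sup = 1#, the generated source for the new entry reduces to

    #if 1
    SIMD_IMPL_INTRIN_1(rev64_u8, vu8, vu8)
    #endif

which, together with the matching SIMD_INTRIN_DEF, makes the intrinsic callable from the test suite as npyv.rev64_u8.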
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 9fd86016d..f72678b54 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -14,8 +14,8 @@
 #define npyv_cvt_s32_b32(A) A
 #define npyv_cvt_u64_b64(A) A
 #define npyv_cvt_s64_b64(A) A
-#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
-#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+#define npyv_cvt_f32_b32 _mm256_castsi256_ps
+#define npyv_cvt_f64_b64 _mm256_castsi256_pd
 
 // convert integer types to mask types
 #define npyv_cvt_b8_u8(BL) BL
@@ -26,7 +26,21 @@
 #define npyv_cvt_b32_s32(BL) BL
 #define npyv_cvt_b64_u64(BL) BL
 #define npyv_cvt_b64_s64(BL) BL
-#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
-#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+#define npyv_cvt_b32_f32 _mm256_castps_si256
+#define npyv_cvt_b64_f64 _mm256_castpd_si256
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint32)_mm256_movemask_epi8(a); }
+
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+    __m128i pack = _mm_packs_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+    return (npy_uint16)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm256_movemask_ps(_mm256_castsi256_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
 
 #endif // _NPY_SIMD_AVX2_CVT_H
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 5a9e68e32..4d6ec8f75 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,4 +94,36 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
     return npyv_combine_f64(ab0, ab1);
 }
 
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+    const __m256i idx = _mm256_setr_epi8(
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+    );
+    return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+    const __m256i idx = _mm256_setr_epi8(
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+    );
+    return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+    return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+    return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
 #endif // _NPY_SIMD_AVX2_REORDER_H
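
The AVX2 tobits helpers map lane i to bit i through movemask. A standalone check of that mapping with the raw intrinsics (hypothetical, not part of the commit; compile with AVX2 enabled):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        // 8 x 32-bit lanes: T, F, F, T, T, T, F, F  (T = all-ones mask)
        __m256i mask = _mm256_setr_epi32(-1, 0, 0, -1, -1, -1, 0, 0);
        unsigned bits = (unsigned)_mm256_movemask_ps(_mm256_castsi256_ps(mask));
        printf("0x%02x\n", bits); // prints 0x39 == 0b00111001: lane i -> bit i
        return 0;
    }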
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0f7e27de3..bd92abccd 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,4 +51,35 @@
 #define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
 #define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
 
+// convert boolean vectors to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+    return (npy_uint64)_cvtmask64_u64(a);
+#elif NPY_HAVE_AVX512BW
+    return (npy_uint64)a;
+#else
+    int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+    int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+    return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+    return (npy_uint32)_cvtmask32_u32(a);
+#elif NPY_HAVE_AVX512BW
+    return (npy_uint32)a;
+#else
+    __m256i pack = _mm256_packs_epi16(
+        npyv512_lower_si256(a), npyv512_higher_si256(a)
+    );
+    return (npy_uint32)_mm256_movemask_epi8(_mm256_permute4x64_epi64(pack, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint16)a; }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)a; }
+
 #endif // _NPY_SIMD_AVX512_CVT_H
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index cdbae7aac..f043004ec 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,4 +167,60 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
     return r;
 }
 
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+    const __m512i idx = npyv_set_u8(
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+    );
+    return _mm512_shuffle_epi8(a, idx);
+#else
+    const __m256i idx = _mm256_setr_epi8(
+        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+    );
+    __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+    __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+    return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+    const __m512i idx = npyv_set_u8(
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+    );
+    return _mm512_shuffle_epi8(a, idx);
+#else
+    const __m256i idx = _mm256_setr_epi8(
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+    );
+    __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+    __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+    return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+    return _mm512_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+    return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
 #endif // _NPY_SIMD_AVX512_REORDER_H
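
On AVX512BW targets the boolean types are genuine mask registers (npyv_b8 is a __mmask64), so the conversion above is only a cast. A hypothetical round-trip with the raw mask intrinsics, assuming an AVX512BW build (not part of the commit):

    #include <immintrin.h>

    static inline unsigned long long mask_roundtrip(unsigned long long x)
    {
        __mmask64 k = _cvtu64_mask64(x); // bitfield -> mask register
        return _cvtmask64_u64(k);        // mask register -> the same bitfield
    }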
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index b286931d1..f9840b1cb 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -7,26 +7,68 @@
 // convert boolean vectors to integer vectors
 #define npyv_cvt_u8_b8(A) A
-#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_s8_b8 vreinterpretq_s8_u8
 #define npyv_cvt_u16_b16(A) A
-#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_s16_b16 vreinterpretq_s16_u16
 #define npyv_cvt_u32_b32(A) A
-#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_s32_b32 vreinterpretq_s32_u32
 #define npyv_cvt_u64_b64(A) A
-#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
-#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
-#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+#define npyv_cvt_s64_b64 vreinterpretq_s64_u64
+#define npyv_cvt_f32_b32 vreinterpretq_f32_u32
+#define npyv_cvt_f64_b64 vreinterpretq_f64_u64
 
 // convert integer vectors to boolean vectors
 #define npyv_cvt_b8_u8(BL) BL
-#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b8_s8 vreinterpretq_u8_s8
 #define npyv_cvt_b16_u16(BL) BL
-#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b16_s16 vreinterpretq_u16_s16
 #define npyv_cvt_b32_u32(BL) BL
-#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b32_s32 vreinterpretq_u32_s32
 #define npyv_cvt_b64_u64(BL) BL
-#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
-#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
-#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+#define npyv_cvt_b64_s64 vreinterpretq_u64_s64
+#define npyv_cvt_b32_f32 vreinterpretq_u32_f32
+#define npyv_cvt_b64_f64 vreinterpretq_u64_f64
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+    const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+    npyv_u8 seq_scale = vandq_u8(a, scale);
+#if NPY_SIMD_F64
+    npy_uint8 sumlo = vaddv_u8(vget_low_u8(seq_scale));
+    npy_uint8 sumhi = vaddv_u8(vget_high_u8(seq_scale));
+    return sumlo + ((int)sumhi << 8);
+#else
+    npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
+    return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+    const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+    npyv_u16 seq_scale = vandq_u16(a, scale);
+#if NPY_SIMD_F64
+    return vaddvq_u16(seq_scale);
+#else
+    npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
+    return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+    const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+    npyv_u32 seq_scale = vandq_u32(a, scale);
+#if NPY_SIMD_F64
+    return vaddvq_u32(seq_scale);
+#else
+    npyv_u64 sumh = vpaddlq_u32(seq_scale);
+    return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+    npyv_u64 bit = vshrq_n_u64(a, 63);
+    return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
+}
 
 #endif // _NPY_SIMD_NEON_CVT_H
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 712a77982..50b06ed11 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -107,4 +107,13 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
 #define npyv_zip_u64 npyv_combine_u64
 #define npyv_zip_s64 npyv_combine_s64
 
+// Reverse elements of each 64-bit lane
+#define npyv_rev64_u8  vrev64q_u8
+#define npyv_rev64_s8  vrev64q_s8
+#define npyv_rev64_u16 vrev64q_u16
+#define npyv_rev64_s16 vrev64q_s16
+#define npyv_rev64_u32 vrev64q_u32
+#define npyv_rev64_s32 vrev64q_s32
+#define npyv_rev64_f32 vrev64q_f32
+
 #endif // _NPY_SIMD_NEON_REORDER_H
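
Lacking a movemask instruction, the NEON paths AND each true lane with its power-of-two weight and then horizontal-add the result. A scalar model of npyv_tobits_b16 (illustrative only, not part of the commit):

    // mask[i] is 0xFFFF for a true lane, 0 otherwise
    static unsigned tobits_b16_model(const unsigned short mask[8])
    {
        unsigned bits = 0;
        for (int i = 0; i < 8; ++i)
            bits += mask[i] ? 1u << i : 0u; // vandq_u16 keeps the weight, vaddvq_u16 sums
        return bits;
    }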
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ea9660d13..ab4beea96 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -14,8 +14,8 @@
 #define npyv_cvt_s32_b32(BL) BL
 #define npyv_cvt_u64_b64(BL) BL
 #define npyv_cvt_s64_b64(BL) BL
-#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
-#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+#define npyv_cvt_f32_b32 _mm_castsi128_ps
+#define npyv_cvt_f64_b64 _mm_castsi128_pd
 
 // convert integer types to mask types
 #define npyv_cvt_b8_u8(A) A
@@ -26,7 +26,20 @@
 #define npyv_cvt_b32_s32(A) A
 #define npyv_cvt_b64_u64(A) A
 #define npyv_cvt_b64_s64(A) A
-#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
-#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+#define npyv_cvt_b32_f32 _mm_castps_si128
+#define npyv_cvt_b64_f64 _mm_castpd_si128
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint16)_mm_movemask_epi8(a); }
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+    __m128i pack = _mm_packs_epi16(a, a);
+    return (npy_uint8)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm_movemask_ps(_mm_castsi128_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
 
 #endif // _NPY_SIMD_SSE_CVT_H
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index 3f68b4ad7..d96ab9c56 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,4 +81,45 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
 NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
 NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
 
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_SSSE3
+    const __m128i idx = _mm_setr_epi8(
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+    );
+    return _mm_shuffle_epi8(a, idx);
+#else
+    __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+    return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_SSSE3
+    const __m128i idx = _mm_setr_epi8(
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+    );
+    return _mm_shuffle_epi8(a, idx);
+#else
+    __m128i rev16 = npyv_rev64_u16(a);
+    // swap 8bit pairs
+    return _mm_or_si128(_mm_slli_epi16(rev16, 8), _mm_srli_epi16(rev16, 8));
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+    return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
 #endif // _NPY_SIMD_SSE_REORDER_H
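
In the SSE2 fallback above, _MM_SHUFFLE(0, 1, 2, 3) writes source lanes 3, 2, 1, 0 into lanes 0..3, so shufflelo plus shufflehi reverses each group of four 16-bit elements. A scalar model of that pair (illustrative only, not part of the commit):

    static void rev64_u16_model(unsigned short v[8])
    {
        for (int half = 0; half < 8; half += 4) { // low half: shufflelo, high half: shufflehi
            unsigned short t;
            t = v[half + 0]; v[half + 0] = v[half + 3]; v[half + 3] = t;
            t = v[half + 1]; v[half + 1] = v[half + 2]; v[half + 2] = t;
        }
    }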
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 6ed135990..5803e1cdd 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,4 +29,26 @@
 #define npyv_cvt_b32_f32(A) ((npyv_b32) A)
 #define npyv_cvt_b64_f64(A) ((npyv_b64) A)
 
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+    const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+    const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+    const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+    npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
+    return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
+}
+
 #endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
index bfb9115fa..6533e5093 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -62,4 +62,45 @@ NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
 NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
 NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
 
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#if defined(NPY_HAVE_VSX3) && ((defined(__GNUC__) && __GNUC__ > 7) || defined(__IBMC__))
+    return (npyv_u8)vec_revb((npyv_u64)a);
+#elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+    npyv_u8 ret;
+    __asm__ ("xxbrd %x0,%x1" : "=wa" (ret) : "wa" (a));
+    return ret;
+#else
+    const npyv_u8 idx = npyv_set_u8(
+        7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+    );
+    return vec_perm(a, a, idx);
+#endif
+}
+NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a)
+{ return (npyv_s8)npyv_rev64_u8((npyv_u8)a); }
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+    const npyv_u8 idx = npyv_set_u8(
+        6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+    );
+    return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a)
+{ return (npyv_s16)npyv_rev64_u16((npyv_u16)a); }
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+    const npyv_u8 idx = npyv_set_u8(
+        4, 5, 6, 7, 0, 1, 2, 3,/*64*/12, 13, 14, 15, 8, 9, 10, 11
+    );
+    return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+
 #endif // _NPY_SIMD_VSX_REORDER_H
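
The VSX tobits paths gather one bit per lane with vec_vbpermq; as I read the permute constants (120, 112, ..., 0), each selects the sign bit of one byte lane. A scalar model of npyv_tobits_b8 under that assumption (illustrative only, not part of the commit):

    // mask[i] is 0xFF for a true lane, 0 otherwise
    static unsigned tobits_b8_model(const unsigned char mask[16])
    {
        unsigned bits = 0;
        for (int i = 0; i < 16; ++i)
            bits |= (unsigned)(mask[i] >> 7) << i; // sign bit of lane i -> bit i
        return bits;
    }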
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 13e8d5ede..196003cdd 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -92,6 +92,32 @@ class _Test_Utility:
         v = self.npyv.setall_u32(0x7fc00000)
         return self.npyv.reinterpret_f32_u32(v)[0]
 
+class _SIMD_BOOL(_Test_Utility):
+    """
+    To test all boolean vector types at once
+    """
+    def _data(self, start=None, count=None, reverse=False):
+        nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+        true_mask = self._true_mask()
+        rng = range(nlanes)
+        if reverse:
+            rng = reversed(rng)
+        return [true_mask if x % 2 else 0 for x in rng]
+
+    def _load_b(self, data):
+        len_str = self.sfx[1:]
+        load = getattr(self.npyv, "load_u" + len_str)
+        cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
+        return cvt(load(data))
+
+    def test_tobits(self):
+        data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
+        for data in (self._data(), self._data(reverse=True)):
+            vdata = self._load_b(data)
+            data_bits = data2bits(data)
+            tobits = bin(self.tobits(vdata))
+            assert tobits == bin(data_bits)
+
 class _SIMD_INT(_Test_Utility):
     """
     To test all integer vector types at once
@@ -459,6 +485,18 @@ class _SIMD_ALL(_Test_Utility):
         vzip = self.zip(vdata_a, vdata_b)
         assert vzip == (data_zipl, data_ziph)
 
+    def test_reorder_rev64(self):
+        # Reverse elements of each 64-bit lane
+        ssize = self._scalar_size()
+        if ssize == 64:
+            return
+        data_rev64 = [
+            y for x in range(0, self.nlanes, 64//ssize)
+              for y in reversed(range(x, x + 64//ssize))
+        ]
+        rev64 = self.rev64(self.load(range(self.nlanes)))
+        assert rev64 == data_rev64
+
     def test_operators_comparison(self):
         if self._is_fp():
             data_a = self._data()
@@ -594,10 +632,12 @@ class _SIMD_ALL(_Test_Utility):
         vsum = self.sum(vdata)
         assert vsum == data_sum
 
+bool_sfx = ("b8", "b16", "b32", "b64")
 int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
 fp_sfx = ("f32", "f64")
 all_sfx = int_sfx + fp_sfx
 tests_registry = {
+    bool_sfx: _SIMD_BOOL,
     int_sfx : _SIMD_INT,
     fp_sfx : _SIMD_FP,
     all_sfx : _SIMD_ALL
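
The test builds its expected rev64 output by reversing indices within each 64-bit group; an equivalent scalar model of that comprehension (illustrative only, not part of the commit):

    // ssize = element width in bits; applies to 8/16/32-bit elements
    static void rev64_expected(int *out, int nlanes, int ssize)
    {
        int step = 64 / ssize;                   // elements per 64-bit lane
        for (int x = 0; x < nlanes; x += step)
            for (int y = 0; y < step; ++y)
                out[x + y] = x + (step - 1 - y); // reversed within the lane
    }

For example, for u32 lanes (ssize = 32, step = 2) the expected pattern is 1, 0, 3, 2, and so on.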