author     Sayed Adel <seiko@imavr.com>    2020-11-16 21:27:11 +0000
committer  Sayed Adel <seiko@imavr.com>    2020-11-17 01:20:42 +0000
commit     1923034242aa7d09a25c7cd70f0a27d280c68f71 (patch)
tree       42b91ca6e59ef6530193036e9542ce628608d4bd /numpy/core
parent     d6ecc97f55fd7ce3a5f3d4709938e7cc066900b8 (diff)
download   numpy-1923034242aa7d09a25c7cd70f0a27d280c68f71.tar.gz
ENH, SIMD: Add new NPYV intrinsics pack(0)

- add bitfield conversion for boolean vectors
- add reverse elements of each 64-bit lane
- add testing cases
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src       67
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h    22
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h       32
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h  31
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h     56
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h    66
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h        9
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h     21
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h        41
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h     22
-rw-r--r--  numpy/core/src/common/simd/vsx/reorder.h        41
12 files changed, 409 insertions, 39 deletions
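Before the per-architecture implementations below, a scalar sketch of what the two new intrinsic families compute (an illustrative reference only, not part of the patch; helper names are made up):

#include <stdint.h>
#include <string.h>

/* npyv_tobits_bN: each boolean lane (all-ones or all-zeros) contributes one
 * bit to an integer bitfield, lane 0 -> bit 0. lanes[i] is a per-lane flag. */
static uint64_t tobits_ref(const uint8_t *lanes, int nlanes)
{
    uint64_t bits = 0;
    for (int i = 0; i < nlanes; ++i)
        bits |= (uint64_t)(lanes[i] != 0) << i;
    return bits;
}

/* npyv_rev64_<sfx>: reverse the order of elements inside every 64-bit lane,
 * leaving the lanes themselves in place. esize is the element size in bytes. */
static void rev64_ref(uint8_t *dst, const uint8_t *src, int nelem, int esize)
{
    int per64 = 8 / esize;  /* elements per 64-bit lane */
    for (int i = 0; i < nelem; ++i) {
        int lane = i / per64;
        int j    = lane * per64 + (per64 - 1 - i % per64);
        memcpy(dst + (size_t)j * esize, src + (size_t)i * esize, esize);
    }
}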
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 18c383871..e3dbcdece 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -9,9 +9,9 @@
#include "_simd_arg.inc"
#include "_simd_easyintrin.inc"
-/*************************************************************************
- * Defining NPYV intrinsics as module functions
- *************************************************************************/
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
@@ -22,6 +22,7 @@
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -227,7 +228,6 @@ err:
/**end repeat1**/
#endif // @ncont_sup@
-
/***************************
* Misc
***************************/
@@ -289,6 +289,10 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
/**end repeat1**/
+#if @rev64_sup@
+SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
+#endif
+
/***************************
* Operators
***************************/
@@ -370,14 +374,26 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
#endif // simd_sup
/**end repeat**/
-/***************************
+/*************************************************************************
* Variant
- ***************************/
+ ************************************************************************/
SIMD_IMPL_INTRIN_0N(cleanup)
-
/*************************************************************************
- * Attach module functions
- *************************************************************************/
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
+/**end repeat**/
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
@@ -389,6 +405,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -416,7 +433,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // ncont_sup
-
/***************************
* Misc
***************************/
@@ -444,8 +460,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
-SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
-SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+#if @rev64_sup@
+SIMD_INTRIN_DEF(rev64_@sfx@)
+#endif
/***************************
* Operators
@@ -517,23 +534,35 @@ SIMD_INTRIN_DEF(sum_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif
-
#endif // simd_sup
/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
/***************************
- * Variant
+ * Conversions
***************************/
-SIMD_INTRIN_DEF(cleanup)
-/***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(tobits_@bsfx@)
+/**end repeat**/
+
+/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
#endif // NPY_SIMD
-/*************************************************************************
- * Defining a separate module for each target
- *************************************************************************/
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
NPY_VISIBILITY_HIDDEN PyObject *
NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
{
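A note on the wrappers above: judging by the macro arguments, SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) generates a one-argument Python-callable wrapper that takes a v@bsfx@ vector and returns a u64, and the matching SIMD_INTRIN_DEF entry later registers it in simd__intrinsics_methods; the macros themselves come from _simd_easyintrin.inc, included at the top of this file.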
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 9fd86016d..f72678b54 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(A) A
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64(A) A
-#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
-#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+#define npyv_cvt_f32_b32 _mm256_castsi256_ps
+#define npyv_cvt_f64_b64 _mm256_castsi256_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(BL) BL
@@ -26,7 +26,21 @@
#define npyv_cvt_b32_s32(BL) BL
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64(BL) BL
-#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
-#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+#define npyv_cvt_b32_f32 _mm256_castps_si256
+#define npyv_cvt_b64_f64 _mm256_castpd_si256
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint32)_mm256_movemask_epi8(a); }
+
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+ return (npy_uint16)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm256_movemask_ps(_mm256_castsi256_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
#endif // _NPY_SIMD_AVX2_CVT_H
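The AVX2 tobits variants above all reduce to the movemask family; a minimal scalar model of that idiom, assuming 0x00/0xFF boolean lanes (illustrative, not NumPy code):

#include <stdint.h>

/* movemask_epi8 gathers the sign bit of every byte into an integer, so a
 * 0x00/0xFF boolean vector maps straight to a per-lane bitfield; the b16
 * case packs 16-bit lanes down to bytes first. 16-byte model: */
static uint32_t movemask_epi8_ref(const uint8_t bytes[16])
{
    uint32_t mask = 0;
    for (int i = 0; i < 16; ++i)
        mask |= (uint32_t)(bytes[i] >> 7) << i;
    return mask;
}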
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 5a9e68e32..4d6ec8f75 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,4 +94,36 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX2_REORDER_H
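The u32/f32 paths above encode the in-lane swap as _MM_SHUFFLE(2, 3, 0, 1); a standalone decode of that control byte (plain C, not NumPy code) shows it selects the order {1, 0, 3, 2} per 128-bit half, i.e. it swaps adjacent 32-bit pairs:

#include <stdio.h>

int main(void)
{
    /* _MM_SHUFFLE(z, y, x, w) == (z << 6) | (y << 4) | (x << 2) | w;
     * 2-bit field i names the source element for destination element i. */
    unsigned imm = (2u << 6) | (3u << 4) | (0u << 2) | 1u;
    for (int i = 0; i < 4; ++i)
        printf("dst[%d] = src[%u]\n", i, (imm >> (2 * i)) & 3u);
    return 0; /* prints source indices 1, 0, 3, 2 */
}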
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0f7e27de3..bd92abccd 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,4 +51,35 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// convert boolean vectors to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint64)_cvtmask64_u64(a);
+#elif NPY_HAVE_AVX512BW
+ return (npy_uint64)a;
+#else
+ int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+ int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+ return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint32)_cvtmask32_u32(a);
+#elif NPY_HAVE_AVX512BW
+ return (npy_uint32)a;
+#else
+ __m256i pack = _mm256_packs_epi16(
+ npyv512_lower_si256(a), npyv512_higher_si256(a)
+ );
+ return (npy_uint32)_mm256_movemask_epi8(_mm256_permute4x64_epi64(pack, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint16)a; }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)a; }
+
#endif // _NPY_SIMD_AVX512_CVT_H
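Two details worth noting above: with AVX512BW, mask registers already are integer bitfields, so tobits_b8/b16 is a plain cast; without it, the 256-bit packs instruction interleaves its operands per 128-bit lane, leaving the 64-bit quadwords as {a0, b0, a1, b1}, and the permute restores {a0, a1, b0, b1}. Decoding the control confirms this (standalone check, not NumPy code):

#include <stdio.h>

int main(void)
{
    /* _MM_SHUFFLE(3, 1, 2, 0): dst.q[i] = src.q[(imm >> 2*i) & 3] */
    unsigned imm = (3u << 6) | (1u << 4) | (2u << 2) | 0u;
    for (int i = 0; i < 4; ++i)
        printf("dst.q[%d] = src.q[%u]\n", i, (imm >> (2 * i)) & 3u);
    return 0; /* prints 0, 2, 1, 3 -> {a0, a1, b0, b1} */
}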
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index cdbae7aac..f043004ec 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,4 +167,60 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm512_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index b286931d1..f9840b1cb 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -7,26 +7,68 @@
// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A) A
-#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_s8_b8 vreinterpretq_s8_u8
#define npyv_cvt_u16_b16(A) A
-#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_s16_b16 vreinterpretq_s16_u16
#define npyv_cvt_u32_b32(A) A
-#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_s32_b32 vreinterpretq_s32_u32
#define npyv_cvt_u64_b64(A) A
-#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
-#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
-#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+#define npyv_cvt_s64_b64 vreinterpretq_s64_u64
+#define npyv_cvt_f32_b32 vreinterpretq_f32_u32
+#define npyv_cvt_f64_b64 vreinterpretq_f64_u64
// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL) BL
-#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b8_s8 vreinterpretq_u8_s8
#define npyv_cvt_b16_u16(BL) BL
-#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b16_s16 vreinterpretq_u16_s16
#define npyv_cvt_b32_u32(BL) BL
-#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b32_s32 vreinterpretq_u32_s32
#define npyv_cvt_b64_u64(BL) BL
-#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
-#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
-#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+#define npyv_cvt_b64_s64 vreinterpretq_u64_s64
+#define npyv_cvt_b32_f32 vreinterpretq_u32_f32
+#define npyv_cvt_b64_f64 vreinterpretq_u64_f64
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u8 seq_scale = vandq_u8(a, scale);
+#if NPY_SIMD_F64
+ npy_uint8 sumlo = vaddv_u8(vget_low_u8(seq_scale));
+ npy_uint8 sumhi = vaddv_u8(vget_high_u8(seq_scale));
+ return sumlo + ((int)sumhi << 8);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
+ return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u16 seq_scale = vandq_u16(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u16(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+ npyv_u32 seq_scale = vandq_u32(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u32(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(seq_scale);
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = vshrq_n_u64(a, 63);
+ return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
+}
#endif // _NPY_SIMD_NEON_CVT_H
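NEON has no movemask equivalent, so the b8 path above multiplies each boolean byte by a power of two (via AND with a scale vector) and horizontally adds each half of the register; a scalar model of that trick (illustrative, not NumPy code):

#include <stdint.h>

static uint16_t neon_tobits_b8_ref(const uint8_t lanes[16])
{
    static const uint8_t scale[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128};
    unsigned lo = 0, hi = 0;
    for (int i = 0; i < 8; ++i) {
        /* lanes are 0x00 or 0xFF, so the AND yields either 0 or scale[i];
         * summing eight distinct powers of two rebuilds one bitfield byte */
        lo += lanes[i]     & scale[i];      /* vaddv_u8(vget_low_u8(...))  */
        hi += lanes[i + 8] & scale[i + 8];  /* vaddv_u8(vget_high_u8(...)) */
    }
    return (uint16_t)(lo | (hi << 8));
}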
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 712a77982..50b06ed11 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -107,4 +107,13 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+// Reverse elements of each 64-bit lane
+#define npyv_rev64_u8 vrev64q_u8
+#define npyv_rev64_s8 vrev64q_s8
+#define npyv_rev64_u16 vrev64q_u16
+#define npyv_rev64_s16 vrev64q_s16
+#define npyv_rev64_u32 vrev64q_u32
+#define npyv_rev64_s32 vrev64q_s32
+#define npyv_rev64_f32 vrev64q_f32
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ea9660d13..ab4beea96 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(BL) BL
#define npyv_cvt_u64_b64(BL) BL
#define npyv_cvt_s64_b64(BL) BL
-#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
-#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+#define npyv_cvt_f32_b32 _mm_castsi128_ps
+#define npyv_cvt_f64_b64 _mm_castsi128_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(A) A
@@ -26,7 +26,20 @@
#define npyv_cvt_b32_s32(A) A
#define npyv_cvt_b64_u64(A) A
#define npyv_cvt_b64_s64(A) A
-#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
-#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+#define npyv_cvt_b32_f32 _mm_castps_si128
+#define npyv_cvt_b64_f64 _mm_castpd_si128
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint16)_mm_movemask_epi8(a); }
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(a, a);
+ return (npy_uint8)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm_movemask_ps(_mm_castsi128_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
#endif // _NPY_SIMD_SSE_CVT_H
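One subtlety in the SSE b16 variant: _mm_packs_epi16(a, a) narrows the eight 16-bit booleans to bytes and duplicates them into both halves of the register, so the movemask result carries the same 8-bit field twice; the (npy_uint8) cast keeps only the low copy.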
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index 3f68b4ad7..d96ab9c56 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,4 +81,45 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+ return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i rev16 = npyv_rev64_u16(a);
+ // swap 8bit pairs
+ return _mm_or_si128(_mm_slli_epi16(rev16, 8), _mm_srli_epi16(rev16, 8));
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 6ed135990..5803e1cdd 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,4 +29,26 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
+ return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
+}
+
#endif // _NPY_SIMD_VSX_CVT_H
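vec_vbpermq does the whole bitfield gather in one instruction: each byte of qperm names a bit position in the 128-bit source (big-endian numbering, 0 = MSB of byte 0; indices >= 128 select nothing), and the gathered bits land with the first index most significant. A scalar model under that assumed semantics (illustrative, not NumPy code):

#include <stdint.h>

static uint16_t vbpermq_ref(const uint8_t src[16], const uint8_t qperm[16])
{
    uint16_t out = 0;
    for (int i = 0; i < 16; ++i) {
        unsigned idx = qperm[i];
        unsigned bit = idx < 128
            ? (src[idx / 8] >> (7 - idx % 8)) & 1u  /* big-endian bit order */
            : 0u;
        out = (uint16_t)((out << 1) | bit);  /* qperm[0] ends up as the MSB */
    }
    return out;
}

Read this way, the b8 constants {120, 112, ..., 8, 0} walk the sign bit of byte 15 down to byte 0, giving lane 15 -> bit 15 through lane 0 -> bit 0; npyv_setf_u8's first argument appears to be the fill value, so the shorter b16/b32 tables pad unused slots with 128 (selecting nothing).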
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
index bfb9115fa..6533e5093 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -62,4 +62,45 @@ NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#if defined(NPY_HAVE_VSX3) && ((defined(__GNUC__) && __GNUC__ > 7) || defined(__IBMC__))
+ return (npyv_u8)vec_revb((npyv_u64)a);
+#elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ npyv_u8 ret;
+ __asm__ ("xxbrd %x0,%x1" : "=wa" (ret) : "wa" (a));
+ return ret;
+#else
+ const npyv_u8 idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return vec_perm(a, a, idx);
+#endif
+}
+NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a)
+{ return (npyv_s8)npyv_rev64_u8((npyv_u8)a); }
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a)
+{ return (npyv_s16)npyv_rev64_u16((npyv_u16)a); }
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 4, 5, 6, 7, 0, 1, 2, 3,/*64*/12, 13, 14, 15, 8, 9, 10, 11
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+
#endif // _NPY_SIMD_VSX_REORDER_H
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 13e8d5ede..196003cdd 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -92,6 +92,32 @@ class _Test_Utility:
v = self.npyv.setall_u32(0x7fc00000)
return self.npyv.reinterpret_f32_u32(v)[0]
+class _SIMD_BOOL(_Test_Utility):
+ """
+ To test all boolean vector types at once
+ """
+ def _data(self, start=None, count=None, reverse=False):
+ nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+ true_mask = self._true_mask()
+ rng = range(nlanes)
+ if reverse:
+ rng = reversed(rng)
+ return [true_mask if x % 2 else 0 for x in rng]
+
+ def _load_b(self, data):
+ len_str = self.sfx[1:]
+ load = getattr(self.npyv, "load_u" + len_str)
+ cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
+ return cvt(load(data))
+
+ def test_tobits(self):
+ data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
+ for data in (self._data(), self._data(reverse=True)):
+ vdata = self._load_b(data)
+ data_bits = data2bits(data)
+ tobits = bin(self.tobits(vdata))
+ assert tobits == bin(data_bits)
+
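As a concrete case of the test above: for b8 on a 128-bit target, _data() sets the odd-numbered lanes, so data2bits yields 0b1010101010101010 (0xAAAA), which must match the bitfield returned by tobits.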
class _SIMD_INT(_Test_Utility):
"""
To test all integer vector types at once
@@ -459,6 +485,18 @@ class _SIMD_ALL(_Test_Utility):
vzip = self.zip(vdata_a, vdata_b)
assert vzip == (data_zipl, data_ziph)
+ def test_reorder_rev64(self):
+ # Reverse elements of each 64-bit lane
+ ssize = self._scalar_size()
+ if ssize == 64:
+ return
+ data_rev64 = [
+ y for x in range(0, self.nlanes, 64//ssize)
+ for y in reversed(range(x, x + 64//ssize))
+ ]
+ rev64 = self.rev64(self.load(range(self.nlanes)))
+ assert rev64 == data_rev64
+
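For instance, with u32 lanes on a 128-bit target (nlanes == 4, ssize == 32) the expected pattern is [1, 0, 3, 2]: each 64-bit lane holds two elements and they swap places. 64-bit element types return early because a lane holds only one element, matching the #rev64_sup# rows in the dispatch repeaters above.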
def test_operators_comparison(self):
if self._is_fp():
data_a = self._data()
@@ -594,10 +632,12 @@ class _SIMD_ALL(_Test_Utility):
vsum = self.sum(vdata)
assert vsum == data_sum
+bool_sfx = ("b8", "b16", "b32", "b64")
int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
all_sfx = int_sfx + fp_sfx
tests_registry = {
+ bool_sfx: _SIMD_BOOL,
int_sfx : _SIMD_INT,
fp_sfx : _SIMD_FP,
all_sfx : _SIMD_ALL