author     Sayed Adel <seiko@imavr.com>    2020-11-16 21:27:11 +0000
committer  Sayed Adel <seiko@imavr.com>    2020-11-17 01:20:42 +0000
commit     1923034242aa7d09a25c7cd70f0a27d280c68f71 (patch)
tree       42b91ca6e59ef6530193036e9542ce628608d4bd /numpy/core
parent     d6ecc97f55fd7ce3a5f3d4709938e7cc066900b8 (diff)
download   numpy-1923034242aa7d09a25c7cd70f0a27d280c68f71.tar.gz
ENH, SIMD: Add new NPYV intrinsics pack(0)

- add bitfield conversion for boolean vectors
- add reverse elements of each 64-bit lane
- add testing cases
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src       67
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h    22
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h       32
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h  31
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h     56
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h    66
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h        9
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h     21
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h        41
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h     22
-rw-r--r--  numpy/core/src/common/simd/vsx/reorder.h        41
12 files changed, 409 insertions, 39 deletions
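Before the per-architecture implementations below, a scalar sketch of what the two new intrinsic families compute (an illustrative reference only, not part of the patch; helper names are made up):

#include <stdint.h>
#include <string.h>

/* npyv_tobits_bN: each boolean lane (all-ones or all-zeros) contributes one
 * bit to an integer bitfield, lane 0 -> bit 0. lanes[i] is a per-lane flag. */
static uint64_t tobits_ref(const uint8_t *lanes, int nlanes)
{
    uint64_t bits = 0;
    for (int i = 0; i < nlanes; ++i)
        bits |= (uint64_t)(lanes[i] != 0) << i;
    return bits;
}

/* npyv_rev64_<sfx>: reverse the order of elements inside every 64-bit lane,
 * leaving the lanes themselves in place. esize is the element size in bytes. */
static void rev64_ref(uint8_t *dst, const uint8_t *src, int nelem, int esize)
{
    int per64 = 8 / esize;  /* elements per 64-bit lane */
    for (int i = 0; i < nelem; ++i) {
        int lane = i / per64;
        int j    = lane * per64 + (per64 - 1 - i % per64);
        memcpy(dst + (size_t)j * esize, src + (size_t)i * esize, esize);
    }
}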
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 18c383871..e3dbcdece 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -9,9 +9,9 @@
#include "_simd_arg.inc"
#include "_simd_easyintrin.inc"
-/*************************************************************************
- * Defining NPYV intrinsics as module functions
- *************************************************************************/
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
@@ -22,6 +22,7 @@
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -227,7 +228,6 @@ err:
/**end repeat1**/
#endif // @ncont_sup@
-
/***************************
* Misc
***************************/
@@ -289,6 +289,10 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
/**end repeat1**/
+#if @rev64_sup@
+SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
+#endif
+
/***************************
* Operators
***************************/
@@ -370,14 +374,26 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
#endif // simd_sup
/**end repeat**/
-/***************************
+/*************************************************************************
* Variant
- ***************************/
+ ************************************************************************/
SIMD_IMPL_INTRIN_0N(cleanup)
-
/*************************************************************************
- * Attach module functions
- *************************************************************************/
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
+/**end repeat**/
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
@@ -389,6 +405,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -416,7 +433,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // ncont_sup
-
/***************************
* Misc
***************************/
@@ -444,8 +460,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
-SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
-SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+#if @rev64_sup@
+SIMD_INTRIN_DEF(rev64_@sfx@)
+#endif
/***************************
* Operators
@@ -517,23 +534,35 @@ SIMD_INTRIN_DEF(sum_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif
-
#endif // simd_sup
/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
/***************************
- * Variant
+ * Conversions
***************************/
-SIMD_INTRIN_DEF(cleanup)
-/***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(tobits_@bsfx@)
+/**end repeat**/
+
+/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
#endif // NPY_SIMD
-/*************************************************************************
- * Defining a separate module for each target
- *************************************************************************/
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
NPY_VISIBILITY_HIDDEN PyObject *
NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
{
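A note on the wrappers above: judging by the macro arguments, SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) generates a one-argument Python-callable wrapper that takes a v@bsfx@ vector and returns a u64, and the matching SIMD_INTRIN_DEF entry later registers it in simd__intrinsics_methods; the macros themselves come from _simd_easyintrin.inc, included at the top of this file.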
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 9fd86016d..f72678b54 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(A) A
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64(A) A
-#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
-#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+#define npyv_cvt_f32_b32 _mm256_castsi256_ps
+#define npyv_cvt_f64_b64 _mm256_castsi256_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(BL) BL
@@ -26,7 +26,21 @@
#define npyv_cvt_b32_s32(BL) BL
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64(BL) BL
-#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
-#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+#define npyv_cvt_b32_f32 _mm256_castps_si256
+#define npyv_cvt_b64_f64 _mm256_castpd_si256
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint32)_mm256_movemask_epi8(a); }
+
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+ return (npy_uint16)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm256_movemask_ps(_mm256_castsi256_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
#endif // _NPY_SIMD_AVX2_CVT_H
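The AVX2 tobits variants above all reduce to the movemask family; a minimal scalar model of that idiom, assuming 0x00/0xFF boolean lanes (illustrative, not NumPy code):

#include <stdint.h>

/* movemask_epi8 gathers the sign bit of every byte into an integer, so a
 * 0x00/0xFF boolean vector maps straight to a per-lane bitfield; the b16
 * case packs 16-bit lanes down to bytes first. 16-byte model: */
static uint32_t movemask_epi8_ref(const uint8_t bytes[16])
{
    uint32_t mask = 0;
    for (int i = 0; i < 16; ++i)
        mask |= (uint32_t)(bytes[i] >> 7) << i;
    return mask;
}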
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 5a9e68e32..4d6ec8f75 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,4 +94,36 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX2_REORDER_H
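The u32/f32 paths above encode the in-lane swap as _MM_SHUFFLE(2, 3, 0, 1); a standalone decode of that control byte (plain C, not NumPy code) shows it selects the order {1, 0, 3, 2} per 128-bit half, i.e. it swaps adjacent 32-bit pairs:

#include <stdio.h>

int main(void)
{
    /* _MM_SHUFFLE(z, y, x, w) == (z << 6) | (y << 4) | (x << 2) | w;
     * 2-bit field i names the source element for destination element i. */
    unsigned imm = (2u << 6) | (3u << 4) | (0u << 2) | 1u;
    for (int i = 0; i < 4; ++i)
        printf("dst[%d] = src[%u]\n", i, (imm >> (2 * i)) & 3u);
    return 0; /* prints source indices 1, 0, 3, 2 */
}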
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0f7e27de3..bd92abccd 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,4 +51,35 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// convert boolean vectors to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint64)_cvtmask64_u64(a);
+#elif NPY_HAVE_AVX512BW
+ return (npy_uint64)a;
+#else
+ int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+ int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+ return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint32)_cvtmask32_u32(a);
+#elif NPY_HAVE_AVX512BW
+ return (npy_uint32)a;
+#else
+ __m256i pack = _mm256_packs_epi16(
+ npyv512_lower_si256(a), npyv512_higher_si256(a)
+ );
+ return (npy_uint32)_mm256_movemask_epi8(_mm256_permute4x64_epi64(pack, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint16)a; }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)a; }
+
#endif // _NPY_SIMD_AVX512_CVT_H
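Two details worth noting above: with AVX512BW, mask registers already are integer bitfields, so tobits_b8/b16 is a plain cast; without it, the 256-bit packs instruction interleaves its operands per 128-bit lane, leaving the 64-bit quadwords as {a0, b0, a1, b1}, and the permute restores {a0, a1, b0, b1}. Decoding the control confirms this (standalone check, not NumPy code):

#include <stdio.h>

int main(void)
{
    /* _MM_SHUFFLE(3, 1, 2, 0): dst.q[i] = src.q[(imm >> 2*i) & 3] */
    unsigned imm = (3u << 6) | (1u << 4) | (2u << 2) | 0u;
    for (int i = 0; i < 4; ++i)
        printf("dst.q[%d] = src.q[%u]\n", i, (imm >> (2 * i)) & 3u);
    return 0; /* prints 0, 2, 1, 3 -> {a0, a1, b0, b1} */
}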
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index cdbae7aac..f043004ec 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,4 +167,60 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm512_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index b286931d1..f9840b1cb 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -7,26 +7,68 @@
// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A) A
-#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_s8_b8 vreinterpretq_s8_u8
#define npyv_cvt_u16_b16(A) A
-#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_s16_b16 vreinterpretq_s16_u16
#define npyv_cvt_u32_b32(A) A
-#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_s32_b32 vreinterpretq_s32_u32
#define npyv_cvt_u64_b64(A) A
-#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
-#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
-#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+#define npyv_cvt_s64_b64 vreinterpretq_s64_u64
+#define npyv_cvt_f32_b32 vreinterpretq_f32_u32
+#define npyv_cvt_f64_b64 vreinterpretq_f64_u64
// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL) BL
-#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b8_s8 vreinterpretq_u8_s8
#define npyv_cvt_b16_u16(BL) BL
-#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b16_s16 vreinterpretq_u16_s16
#define npyv_cvt_b32_u32(BL) BL
-#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b32_s32 vreinterpretq_u32_s32
#define npyv_cvt_b64_u64(BL) BL
-#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
-#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
-#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+#define npyv_cvt_b64_s64 vreinterpretq_u64_s64
+#define npyv_cvt_b32_f32 vreinterpretq_u32_f32
+#define npyv_cvt_b64_f64 vreinterpretq_u64_f64
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u8 seq_scale = vandq_u8(a, scale);
+#if NPY_SIMD_F64
+ npy_uint8 sumlo = vaddv_u8(vget_low_u8(seq_scale));
+ npy_uint8 sumhi = vaddv_u8(vget_high_u8(seq_scale));
+ return sumlo + ((int)sumhi << 8);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
+ return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u16 seq_scale = vandq_u16(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u16(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+ npyv_u32 seq_scale = vandq_u32(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u32(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(seq_scale);
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = vshrq_n_u64(a, 63);
+ return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
+}
#endif // _NPY_SIMD_NEON_CVT_H
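NEON has no movemask equivalent, so the b8 path above multiplies each boolean byte by a power of two (via AND with a scale vector) and horizontally adds each half of the register; a scalar model of that trick (illustrative, not NumPy code):

#include <stdint.h>

static uint16_t neon_tobits_b8_ref(const uint8_t lanes[16])
{
    static const uint8_t scale[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128};
    unsigned lo = 0, hi = 0;
    for (int i = 0; i < 8; ++i) {
        /* lanes are 0x00 or 0xFF, so the AND yields either 0 or scale[i];
         * summing eight distinct powers of two rebuilds one bitfield byte */
        lo += lanes[i]     & scale[i];      /* vaddv_u8(vget_low_u8(...))  */
        hi += lanes[i + 8] & scale[i + 8];  /* vaddv_u8(vget_high_u8(...)) */
    }
    return (uint16_t)(lo | (hi << 8));
}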
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 712a77982..50b06ed11 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -107,4 +107,13 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+// Reverse elements of each 64-bit lane
+#define npyv_rev64_u8 vrev64q_u8
+#define npyv_rev64_s8 vrev64q_s8
+#define npyv_rev64_u16 vrev64q_u16
+#define npyv_rev64_s16 vrev64q_s16
+#define npyv_rev64_u32 vrev64q_u32
+#define npyv_rev64_s32 vrev64q_s32
+#define npyv_rev64_f32 vrev64q_f32
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ea9660d13..ab4beea96 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(BL) BL
#define npyv_cvt_u64_b64(BL) BL
#define npyv_cvt_s64_b64(BL) BL
-#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
-#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+#define npyv_cvt_f32_b32 _mm_castsi128_ps
+#define npyv_cvt_f64_b64 _mm_castsi128_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(A) A
@@ -26,7 +26,20 @@
#define npyv_cvt_b32_s32(A) A
#define npyv_cvt_b64_u64(A) A
#define npyv_cvt_b64_s64(A) A
-#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
-#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+#define npyv_cvt_b32_f32 _mm_castps_si128
+#define npyv_cvt_b64_f64 _mm_castpd_si128
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint16)_mm_movemask_epi8(a); }
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(a, a);
+ return (npy_uint8)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm_movemask_ps(_mm_castsi128_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
#endif // _NPY_SIMD_SSE_CVT_H
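One subtlety in the SSE b16 variant: _mm_packs_epi16(a, a) narrows the eight 16-bit booleans to bytes and duplicates them into both halves of the register, so the movemask result carries the same 8-bit field twice; the (npy_uint8) cast keeps only the low copy.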
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index 3f68b4ad7..d96ab9c56 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,4 +81,45 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+ return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i rev16 = npyv_rev64_u16(a);
+ // swap 8bit pairs
+ return _mm_or_si128(_mm_slli_epi16(rev16, 8), _mm_srli_epi16(rev16, 8));
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 6ed135990..5803e1cdd 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,4 +29,26 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
+ return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
+}
+
#endif // _NPY_SIMD_VSX_CVT_H
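vec_vbpermq does the whole bitfield gather in one instruction: each byte of qperm names a bit position in the 128-bit source (big-endian numbering, 0 = MSB of byte 0; indices >= 128 select nothing), and the gathered bits land with the first index most significant. A scalar model under that assumed semantics (illustrative, not NumPy code):

#include <stdint.h>

static uint16_t vbpermq_ref(const uint8_t src[16], const uint8_t qperm[16])
{
    uint16_t out = 0;
    for (int i = 0; i < 16; ++i) {
        unsigned idx = qperm[i];
        unsigned bit = idx < 128
            ? (src[idx / 8] >> (7 - idx % 8)) & 1u  /* big-endian bit order */
            : 0u;
        out = (uint16_t)((out << 1) | bit);  /* qperm[0] ends up as the MSB */
    }
    return out;
}

Read this way, the b8 constants {120, 112, ..., 8, 0} walk the sign bit of byte 15 down to byte 0, giving lane 15 -> bit 15 through lane 0 -> bit 0; npyv_setf_u8's first argument appears to be the fill value, so the shorter b16/b32 tables pad unused slots with 128 (selecting nothing).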
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
index bfb9115fa..6533e5093 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -62,4 +62,45 @@ NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#if defined(NPY_HAVE_VSX3) && ((defined(__GNUC__) && __GNUC__ > 7) || defined(__IBMC__))
+ return (npyv_u8)vec_revb((npyv_u64)a);
+#elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ npyv_u8 ret;
+ __asm__ ("xxbrd %x0,%x1" : "=wa" (ret) : "wa" (a));
+ return ret;
+#else
+ const npyv_u8 idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return vec_perm(a, a, idx);
+#endif
+}
+NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a)
+{ return (npyv_s8)npyv_rev64_u8((npyv_u8)a); }
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a)
+{ return (npyv_s16)npyv_rev64_u16((npyv_u16)a); }
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 4, 5, 6, 7, 0, 1, 2, 3,/*64*/12, 13, 14, 15, 8, 9, 10, 11
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+
#endif // _NPY_SIMD_VSX_REORDER_H
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 13e8d5ede..196003cdd 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -92,6 +92,32 @@ class _Test_Utility:
v = self.npyv.setall_u32(0x7fc00000)
return self.npyv.reinterpret_f32_u32(v)[0]
+class _SIMD_BOOL(_Test_Utility):
+ """
+ To test all boolean vector types at once
+ """
+ def _data(self, start=None, count=None, reverse=False):
+ nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+ true_mask = self._true_mask()
+ rng = range(nlanes)
+ if reverse:
+ rng = reversed(rng)
+ return [true_mask if x % 2 else 0 for x in rng]
+
+ def _load_b(self, data):
+ len_str = self.sfx[1:]
+ load = getattr(self.npyv, "load_u" + len_str)
+ cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
+ return cvt(load(data))
+
+ def test_tobits(self):
+ data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
+ for data in (self._data(), self._data(reverse=True)):
+ vdata = self._load_b(data)
+ data_bits = data2bits(data)
+ tobits = bin(self.tobits(vdata))
+ assert tobits == bin(data_bits)
+
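As a concrete case of the test above: for b8 on a 128-bit target, _data() sets the odd-numbered lanes, so data2bits yields 0b1010101010101010 (0xAAAA), which must match the bitfield returned by tobits.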
class _SIMD_INT(_Test_Utility):
"""
To test all integer vector types at once
@@ -459,6 +485,18 @@ class _SIMD_ALL(_Test_Utility):
vzip = self.zip(vdata_a, vdata_b)
assert vzip == (data_zipl, data_ziph)
+ def test_reorder_rev64(self):
+ # Reverse elements of each 64-bit lane
+ ssize = self._scalar_size()
+ if ssize == 64:
+ return
+ data_rev64 = [
+ y for x in range(0, self.nlanes, 64//ssize)
+ for y in reversed(range(x, x + 64//ssize))
+ ]
+ rev64 = self.rev64(self.load(range(self.nlanes)))
+ assert rev64 == data_rev64
+
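For instance, with u32 lanes on a 128-bit target (nlanes == 4, ssize == 32) the expected pattern is [1, 0, 3, 2]: each 64-bit lane holds two elements and they swap places. 64-bit element types return early because a lane holds only one element, matching the #rev64_sup# rows in the dispatch repeaters above.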
def test_operators_comparison(self):
if self._is_fp():
data_a = self._data()
@@ -594,10 +632,12 @@ class _SIMD_ALL(_Test_Utility):
vsum = self.sum(vdata)
assert vsum == data_sum
+bool_sfx = ("b8", "b16", "b32", "b64")
int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
all_sfx = int_sfx + fp_sfx
tests_registry = {
+ bool_sfx: _SIMD_BOOL,
int_sfx : _SIMD_INT,
fp_sfx : _SIMD_FP,
all_sfx : _SIMD_ALL