author    Rafael Cardoso Fernandes Sousa <rafaelcfsousa@ibm.com>  2022-01-25 14:42:21 -0600
committer Rafael Cardoso Fernandes Sousa <rafaelcfsousa@ibm.com>  2022-02-08 10:30:17 -0600
commit    2b45eac52ed67d20b6193d9a2d0f563b675f2df6 (patch)
tree      155c611d0309cbd8e7e038fbc505c855c005795b /numpy
parent    5c25a5a0dc7b59ee869e5c295f85ec1bd4accfa6 (diff)
download  numpy-2b45eac52ed67d20b6193d9a2d0f563b675f2df6.tar.gz
ENH: Optimize floor_divide for VSX4/Power10
Diffstat (limited to 'numpy')
 numpy/core/src/common/simd/intdiv.h                  |  3 -
 numpy/core/src/common/simd/vsx/arithmetic.h          |  3 -
 numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 90 ++++++++-
 3 files changed, 86 insertions(+), 10 deletions(-)
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 42f022c55..8b65b3a76 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -46,9 +46,6 @@
* - For 64-bit division on AArch64 and IBM/Power, we fall back to scalar division
* since emulating multiply-high is expensive and both architectures have very fast dividers.
*
- ** TODO:
- * - Add support for Power10(VSX4)
- *
***************************************************************
** Figure 4.1: Unsigned division by run-time invariant divisor
***************************************************************
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index eaca53620..299ef08b4 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -97,9 +97,6 @@
/***************************
* Integer Division
***************************/
-/***
- * TODO: Add support for VSX4(Power10)
- */
// See simd/intdiv.h for more clarification
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
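For context, npyv_divc_* does not issue a hardware divide per element: intdiv.h precomputes a multiplier and two shift counts from the run-time invariant divisor (the Granlund-Montgomery scheme behind the "Figure 4.1" reference above), so each element costs only a multiply-high plus shifts. A minimal scalar sketch of the idea, shown here for 32-bit lanes with illustrative names that are not NumPy API:

#include <stdint.h>

typedef struct { uint32_t mul; uint32_t sh1, sh2; } divisor_u32;  /* hypothetical */

/* One-time setup per invariant divisor; d must be nonzero. */
static divisor_u32 precompute_u32(uint32_t d)
{
    divisor_u32 r;
    uint32_t l = 0;
    while ((1ull << l) < d) {
        ++l;                               /* l = ceil(log2(d)) */
    }
    /* magic multiplier: floor(2^32 * (2^l - d) / d) + 1 */
    r.mul = (uint32_t)((((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d) + 1;
    r.sh1 = l < 1 ? l : 1;
    r.sh2 = l > 0 ? l - 1 : 0;
    return r;
}

/* Per-element work: one multiply-high, a subtract, and two shifts. */
static uint32_t divc_u32(uint32_t n, divisor_u32 dv)
{
    uint32_t hi = (uint32_t)(((uint64_t)n * dv.mul) >> 32);  /* mulhi */
    return (hi + ((n - hi) >> dv.sh1)) >> dv.sh2;
}

The 8- and 16-bit variants follow the same pattern with narrower lanes; the SIMD versions simply apply it to every lane at once.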
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 1ddf7c3b1..fc170904f 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -1,7 +1,7 @@
/*@targets
** $maxopt baseline
** sse2 sse41 avx2 avx512f avx512_skx
- ** vsx2
+ ** vsx2 vsx4
** neon
**/
#define _UMATHMODULE
@@ -101,6 +101,82 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
/**end repeat**/
+#if defined(NPY_HAVE_VSX4)
+/*
+ * As Power10 only supports integer vector division for data of 32 bits or
+ * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
+ * vector division instruction, and then convert the result back to npyv_u8.
+ */
+NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+{
+ npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+ npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+ npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+ npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+ npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+ npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+ npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]);
+ npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]);
+ npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]);
+ npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]);
+ npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1);
+ npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2);
+ npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo);
+ return res;
+}
+
+NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b)
+{
+ npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
+ npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
+ npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]);
+ npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]);
+ npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo);
+ return res;
+}
+
+#define vsx4_div_u32 vec_div
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32#
+ * #len = 8, 16, 32#
+ */
+static NPY_INLINE void
+vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
+{
+ npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+ const int vstep = npyv_nlanes_@sfx@;
+ const npyv_@sfx@ zero = npyv_zero_@sfx@();
+
+ for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+ dst += vstep) {
+ npyv_@sfx@ a = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b = npyv_load_@sfx@(src2);
+ npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
+ npyv_store_@sfx@(dst, c);
+ if (NPY_UNLIKELY(vec_any_eq(b, zero))) {
+ npy_set_floatstatus_divbyzero();
+ }
+ }
+
+ for (; len > 0; --len, ++src1, ++src2, ++dst) {
+ const npyv_lanetype_@sfx@ a = *src1;
+ const npyv_lanetype_@sfx@ b = *src2;
+ if (NPY_UNLIKELY(b == 0)) {
+ npy_set_floatstatus_divbyzero();
+ *dst = 0;
+ } else {
+ *dst = a / b;
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_HAVE_VSX4
+
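To make the widen -> vec_div -> pack bookkeeping above concrete, here is a scalar model of vsx4_div_u16 (illustrative only, not NumPy API; it assumes the 128-bit VSX register holds eight u16 lanes and that b has no zero lanes, which scalar division cannot tolerate, whereas the vector loop stores the hardware result and raises the divide-by-zero flag when vec_any_eq(b, zero)):

#include <stdint.h>

/* Lane handling is shown schematically; the real expand/pack intrinsics
 * keep the two halves paired consistently, so the round-trip preserves
 * element order. */
static void model_div_u16(const uint16_t a[8], const uint16_t b[8], uint16_t r[8])
{
    uint32_t q[8];
    /* npyv_expand_u32_u16: zero-extend each u16 lane to u32;
     * vec_div: native 32-bit division on each widened lane */
    for (int i = 0; i < 8; i++) {
        q[i] = (uint32_t)a[i] / (uint32_t)b[i];
    }
    /* vec_pack: truncate each u32 quotient back to u16 (always exact,
     * since the quotient of two u16 values fits in 16 bits) */
    for (int i = 0; i < 8; i++) {
        r[i] = (uint16_t)q[i];
    }
}

vsx4_div_u8 is the same idea one level deeper: two u8 -> u16 expansions, four u16 -> u32 expansions, four vec_div calls, then two rounds of packing.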
/**begin repeat
* Unsigned types
* #sfx = u8, u16, u32, u64#
@@ -128,7 +204,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
npyv_cleanup();
}
/**end repeat**/
-#endif
+#endif // NPY_SIMD
/********************************************************************************
** Defining ufunc inner functions
@@ -203,6 +279,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
* #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #vector = 1, 1, 1, 0, 0#
*/
#undef TO_SIMD_SFX
#if 0
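A note on the @...@ syntax for readers unfamiliar with NumPy's .c.src templating: each /**begin repeat**/ block is emitted once per position in the # lists, with every @name@ replaced by the corresponding value. A tiny illustrative example (not from this file):

/**begin repeat
 * #sfx = u8, u16#
 */
static void foo_@sfx@(void);
/**end repeat**/

expands to declarations of foo_u8 and foo_u16. The new #vector = 1, 1, 1, 0, 0# entry therefore pairs each unsigned type with a flag that is 1 only for u8/u16/u32, gating the VSX4 array-by-array path added below to exactly the types vsx4_simd_divide_contig_@sfx@ was instantiated for.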
@@ -218,8 +295,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
* because emulating multiply-high on these architectures is going to be expensive compared
* to the native scalar dividers.
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
- * Power10(VSX4) is an exception here since it has native support for integer vector division,
- * note neither infrastructure nor NPYV has supported VSX4 yet.
+ * Power10(VSX4) is an exception here since it has native support for integer vector division.
*/
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#undef TO_SIMD_SFX
@@ -240,6 +316,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
*((@type@ *)iop1) = io1;
}
#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4) && @vector@
+ // both arguments are arrays of the same size
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+ }
+#endif
// for contiguous block of memory, divisor is a scalar and not 0
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
(*(@type@ *)args[1]) != 0) {
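For reference, the calling contract the new inner loop assumes: args packs three contiguous buffers (dividend, divisor, destination) and the element count arrives as dimensions[0]. A hypothetical driver, illustration only; in reality the function is static to the generated dispatch unit and is only reachable through the ufunc machinery on a Power10 build:

npy_uint32 src1[64], src2[64], dst[64];
/* ... fill src1 and src2; zero divisors are permitted: they raise the
 * divide-by-zero floating-point flag, and elements handled by the
 * scalar tail are written as 0 ... */
char *args[3] = {(char *)src1, (char *)src2, (char *)dst};
vsx4_simd_divide_contig_u32(args, 64);   /* the @sfx@ = u32 instance */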