-rw-r--r--   benchmarks/benchmarks/bench_ufunc.py                  | 13
-rw-r--r--   doc/release/upcoming_changes/20821.improvement.rst    |  5
-rw-r--r--   numpy/core/src/common/simd/intdiv.h                   |  3
-rw-r--r--   numpy/core/src/common/simd/vsx/arithmetic.h           | 17
-rw-r--r--   numpy/core/src/umath/loops_arithmetic.dispatch.c.src  | 90
5 files changed, 118 insertions, 10 deletions
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index b036581e1..5aff1f56d 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -150,6 +150,19 @@ class CustomScalarFloorDivideInt(Benchmark):
     def time_floor_divide_int(self, dtype, divisor):
         self.x // divisor
 
+class CustomArrayFloorDivideInt(Benchmark):
+    params = (np.sctypes['int'] + np.sctypes['uint'], [100, 10000, 1000000])
+    param_names = ['dtype', 'size']
+
+    def setup(self, dtype, size):
+        iinfo = np.iinfo(dtype)
+        self.x = np.random.randint(
+            iinfo.min, iinfo.max, size=size, dtype=dtype)
+        self.y = np.random.randint(2, 32, size=size, dtype=dtype)
+
+    def time_floor_divide_int(self, dtype, size):
+        self.x // self.y
+
 
 class Scalar(Benchmark):
     def setup(self):
diff --git a/doc/release/upcoming_changes/20821.improvement.rst b/doc/release/upcoming_changes/20821.improvement.rst
new file mode 100644
index 000000000..e3ce857a6
--- /dev/null
+++ b/doc/release/upcoming_changes/20821.improvement.rst
@@ -0,0 +1,5 @@
+Add support for VSX4/Power10
+----------------------------------------------
+With VSX4/Power10 enablement, the new instructions available in
+Power ISA 3.1 can be used to accelerate some NumPy operations,
+e.g., floor_divide, modulo, etc.
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 42f022c55..8b65b3a76 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -46,9 +46,6 @@
  * - For 64-bit division on Aarch64 and IBM/Power, we fall-back to the scalar division
  *   since emulating multiply-high is expensive and both architectures have very fast dividers.
  *
- ** TODO:
- *  - Add support for Power10(VSX4)
- *
  ***************************************************************
  ** Figure 4.1: Unsigned division by run–time invariant divisor
  ***************************************************************
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index eaca53620..01dbf5480 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -97,9 +97,6 @@
 /***************************
  * Integer Division
  ***************************/
-/***
- * TODO: Add support for VSX4(Power10)
- */
 // See simd/intdiv.h for more clarification
 // divide each unsigned 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
@@ -172,6 +169,10 @@ NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
 // divide each unsigned 32-bit element by a precomputed divisor
 NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    // high part of unsigned multiplication
+    npyv_u32 mulhi = vec_mulh(a, divisor.val[0]);
+#else
 #if defined(__GNUC__) && __GNUC__ < 8
     // Doubleword integer wide multiplication supported by GCC 8+
     npyv_u64 mul_even, mul_odd;
@@ -184,6 +185,7 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 #endif
     // high part of unsigned multiplication
     npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd);
+#endif
     // floor(x/d) = (((a-mulhi) >> sh1) + mulhi) >> sh2
     npyv_u32 q = vec_sub(a, mulhi);
              q = vec_sr(q, divisor.val[1]);
@@ -194,6 +196,10 @@
 // divide each signed 32-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    // high part of signed multiplication
+    npyv_s32 mulhi = vec_mulh(a, divisor.val[0]);
+#else
 #if defined(__GNUC__) && __GNUC__ < 8
     // Doubleword integer wide multiplication supported by GCC8+
     npyv_s64 mul_even, mul_odd;
@@ -206,6 +212,7 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 #endif
     // high part of signed multiplication
     npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd);
+#endif
     // q = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d) = (q ^ dsign) - dsign
     npyv_s32 q = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
@@ -216,8 +223,12 @@ NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 // divide each unsigned 64-bit element by a precomputed divisor
 NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
 {
+#if defined(NPY_HAVE_VSX4)
+    return vec_div(a, divisor.val[0]);
+#else
     const npy_uint64 d = vec_extract(divisor.val[0], 0);
     return npyv_set_u64(vec_extract(a, 0) / d, vec_extract(a, 1) / d);
+#endif
 }
 // divide each signed 64-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 1ddf7c3b1..fc170904f 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -1,7 +1,7 @@
 /*@targets
  ** $maxopt baseline
  ** sse2 sse41 avx2 avx512f avx512_skx
- ** vsx2
+ ** vsx2 vsx4
  ** neon
  **/
 #define _UMATHMODULE
@@ -101,6 +101,82 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 }
 /**end repeat**/
 
+#if defined(NPY_HAVE_VSX4)
+/*
+ * As Power10 only supports integer vector division for data of 32 bits or
+ * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
+ * vector division instruction, and then, convert the result back to npyv_u8.
+ */
+NPY_FINLINE npyv_u8 vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+{
+    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+    npyv_u32 divhi1 = vec_div(ahi.val[0], bhi.val[0]);
+    npyv_u32 divlo1 = vec_div(ahi.val[1], bhi.val[1]);
+    npyv_u32 divhi2 = vec_div(alo.val[0], blo.val[0]);
+    npyv_u32 divlo2 = vec_div(alo.val[1], blo.val[1]);
+    npyv_u16 reshi = (npyv_u16)vec_pack(divhi1, divlo1);
+    npyv_u16 reslo = (npyv_u16)vec_pack(divhi2, divlo2);
+    npyv_u8 res = (npyv_u8)vec_pack(reshi, reslo);
+    return res;
+}
+
+NPY_FINLINE npyv_u16 vsx4_div_u16(npyv_u16 a, npyv_u16 b)
+{
+    npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
+    npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
+    npyv_u32 divhi = vec_div(a_expand.val[0], b_expand.val[0]);
+    npyv_u32 divlo = vec_div(a_expand.val[1], b_expand.val[1]);
+    npyv_u16 res = (npyv_u16)vec_pack(divhi, divlo);
+    return res;
+}
+
+#define vsx4_div_u32 vec_div
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32#
+ * #len = 8, 16, 32#
+ */
+static NPY_INLINE void
+vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst  = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@ zero = npyv_zero_@sfx@();
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst += vstep) {
+        npyv_@sfx@ a = npyv_load_@sfx@(src1);
+        npyv_@sfx@ b = npyv_load_@sfx@(src2);
+        npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
+        npyv_store_@sfx@(dst, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, zero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_@sfx@ a = *src1;
+        const npyv_lanetype_@sfx@ b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst = 0;
+        } else{
+            *dst = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_HAVE_VSX4
+
 /**begin repeat
  * Unsigned types
  * #sfx = u8, u16, u32, u64#
@@ -128,7 +204,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     npyv_cleanup();
 }
 /**end repeat**/
-#endif
+#endif // NPY_SIMD
 
 /********************************************************************************
  ** Defining ufunc inner functions
@@ -203,6 +279,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
  * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
  * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #vector = 1, 1, 1, 0, 0#
  */
 #undef TO_SIMD_SFX
 #if 0
@@ -218,8 +295,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
  *        because emulating multiply-high on these architectures is going to be expensive comparing
  *        to the native scalar dividers.
  *        Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
- *        Power10(VSX4) is an exception here since it has native support for integer vector division,
- *        note neither infrastructure nor NPYV has supported VSX4 yet.
+ *        Power10(VSX4) is an exception here since it has native support for integer vector division.
  */
 #if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
@@ -240,6 +316,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
         *((@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4) && @vector@
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
     // for contiguous block of memory, divisor is a scalar and not 0
     else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
              (*(@type@ *)args[1]) != 0) {
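
Note on the npyv_divc_u32/npyv_divc_s32 hunks: the VSX4 branch only replaces the multiply-high emulation with Power10's vec_mulh; the surrounding precomputed-divisor scheme from numpy/core/src/common/simd/intdiv.h (the "Figure 4.1" method mentioned in its header, floor(a/d) = (((a - mulhi) >> sh1) + mulhi) >> sh2) is unchanged. The following is a scalar C sketch of that unsigned scheme for readers of the patch; the divisor_u32 struct and the precompute_u32/divc_u32 helpers are illustrative names for this note, not NumPy's API, and the precompute shown assumes a non-zero divisor.

#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint32_t m;    /* magic multiplier */
    uint32_t sh1;  /* first shift      */
    uint32_t sh2;  /* second shift     */
} divisor_u32;

/* Precompute (m, sh1, sh2) for a runtime-invariant divisor d > 0,
 * in the spirit of the "Figure 4.1" scheme referenced in intdiv.h. */
static divisor_u32 precompute_u32(uint32_t d)
{
    divisor_u32 div;
    uint32_t l = 0;
    while ((1ull << l) < d) {     /* l = ceil(log2(d)) */
        ++l;
    }
    uint64_t l2 = 1ull << l;      /* 2^l, kept in 64 bits */
    div.m   = (uint32_t)(((l2 - d) << 32) / d + 1);
    div.sh1 = l < 1 ? l : 1;
    div.sh2 = l > 1 ? l - 1 : 0;
    return div;
}

/* floor(a/d) = (((a - mulhi) >> sh1) + mulhi) >> sh2,
 * where mulhi is the high 32 bits of a * m. */
static uint32_t divc_u32(uint32_t a, divisor_u32 div)
{
    uint32_t mulhi = (uint32_t)(((uint64_t)a * div.m) >> 32);
    uint32_t q     = (a - mulhi) >> div.sh1;
    return (q + mulhi) >> div.sh2;
}

int main(void)
{
    divisor_u32 d7 = precompute_u32(7);
    for (uint32_t a = 0; a < 1000; ++a) {
        if (divc_u32(a, d7) != a / 7) {
            printf("mismatch at %u\n", a);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}

The payoff of the precompute is that a loop-invariant divisor costs one multiply-high and two shifts per element, which vectorizes even on ISAs without a native integer vector divide; on VSX4 only the multiply-high itself becomes a single instruction.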
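vsx4_div_u8 and vsx4_div_u16 exist because, as the comment in the hunk says, Power10's vector divide only comes in 32-bit and 64-bit lane widths. Below is a scalar sketch of the same lane bookkeeping for the u8 case, assuming a 128-bit register (sixteen byte lanes) and non-zero divisor lanes; the helper name is hypothetical and the grouping order is illustrative rather than the exact expand/pack interleaving of the intrinsics.

#include <stdint.h>

/* Widen sixteen u8 lanes to four groups of four u32 lanes, divide each
 * group (the width vec_div operates on), then narrow back to u8 lanes. */
static void div_u8_via_u32(const uint8_t a[16], const uint8_t b[16],
                           uint8_t out[16])
{
    for (int group = 0; group < 4; ++group) {
        uint32_t wa[4], wb[4];
        for (int lane = 0; lane < 4; ++lane) {   /* u8 -> u32 expand */
            wa[lane] = a[group * 4 + lane];
            wb[lane] = b[group * 4 + lane];
        }
        for (int lane = 0; lane < 4; ++lane) {   /* 32-bit divide, then pack */
            out[group * 4 + lane] = (uint8_t)(wa[lane] / wb[lane]);
        }
    }
}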
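The vsx4_simd_divide_contig loops divide whole vectors first and only afterwards check whether any divisor lane was zero; the scalar tail spells out the intended per-element behaviour, namely that a zero divisor yields 0 and raises the divide-by-zero floating-point status. A scalar reference of that contract follows, where feraiseexcept(FE_DIVBYZERO) stands in for npy_set_floatstatus_divbyzero() (an assumption about what that helper amounts to on common platforms) and the function name is illustrative.

#include <fenv.h>
#include <stddef.h>
#include <stdint.h>

/* Per-element contract of the unsigned integer divide inner loop:
 * out[i] = src1[i] / src2[i], except that a zero divisor produces 0
 * and raises the FP divide-by-zero flag. */
static void divide_contig_u32_ref(const uint32_t *src1, const uint32_t *src2,
                                  uint32_t *dst, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        if (src2[i] == 0) {
            feraiseexcept(FE_DIVBYZERO);  /* stand-in for npy_set_floatstatus_divbyzero() */
            dst[i] = 0;
        }
        else {
            dst[i] = src1[i] / src2[i];
        }
    }
}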