diff options
| author | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-20 16:22:06 +0530 |
|---|---|---|
| committer | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-20 16:22:13 +0530 |
| commit | bbb143646cbaad2866ed401ca3c795f083285f78 (patch) | |
| tree | 28220dc64efe91ff41898c56a76b29f005473792 /numpy/core/src | |
| parent | 71e84dcd2ec1a59b6426f05b9095a3a2fd51c01d (diff) | |
| download | numpy-bbb143646cbaad2866ed401ca3c795f083285f78.tar.gz | |
SIMD, MAINT: Refined kernel and inner ufunc functions
Diffstat (limited to 'numpy/core/src')
| -rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 109 |
1 file changed, 43 insertions, 66 deletions
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 0e68f1b7b..a012d50dd 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** sse2 sse41 avx2 avx512_skx + ** sse2 sse41 avx2 avx512f avx512_skx ** vsx2 ** neon **/ @@ -12,26 +12,26 @@ #include "loops_utils.h" #include "loops.h" #include "lowlevel_strided_loops.h" -#include<signal.h> // Provides the various *_LOOP macros #include "fast_loop_macros.h" //############################################################################### -//## Unsigned Integers +//## Division //############################################################################### /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ -#ifdef NPY_SIMD +#if NPY_SIMD /**begin repeat * #sfx = u8, u16, u32, u64# */ - -static void simd_divide_by_scalar_contig_contig_@sfx@ -(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst, - int len) +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { - const int vstep = npyv_nlanes_@sfx@; + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { @@ -39,92 +39,69 @@ static void simd_divide_by_scalar_contig_contig_@sfx@ npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); npyv_store_@sfx@(dst, c); } + for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; *dst = a / scalar; } + npyv_cleanup(); } - /**end repeat**/ #endif 
+/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ - -// XXX Need to see what can be done for 64 bits /**begin repeat * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ -#if NPY_BITSOF_@SIGNED_TYPE@ <= 8 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32 -#else - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64 -#endif -static NPY_INLINE int -run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - if (n == 0) { - return 1; - } - - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - BINARY_LOOP_SLIDING { - *((@type@ *)op1) = 0; - } - return 1; - } -#if defined NPY_SIMD - #ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - #elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - #else - const npy_intp vector_size_bytes = 16; - #endif - // XXX Implement other loops - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) { - simd_divide_by_scalar_@type@(ip1, in2, op1, n); - return 1; - } +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_u@len@ +/**end repeat1**/ #endif - return 0; -} -/**end 
repeat**/ -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(@type@) { - io1 /= *(@type@ *)ip2; + const @type@ d = *(@type@ *)ip2; + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 /= d; + } } *((@type@ *)iop1) = io1; } - else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) { +#if NPY_SIMD && defined(TO_SIMD_SFX) + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); + } +#endif + else { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); *((@type@ *)op1) = 0; + } else{ + *((@type@ *)op1) = in1 / in2; } - *((@type@ *)op1) = in1 / in2; } } } |
