diff options
| author | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-08 14:17:19 +0530 |
|---|---|---|
| committer | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-20 16:22:13 +0530 |
| commit | 50752aa920be32b74c1a7d0e4242e84b15ffa73c (patch) | |
| tree | 5b956f368d9bafcc2c715560513eadd1869c352e | |
| parent | 2ccc7942f072b7fdbe54f148cdcb6a7d79388a89 (diff) | |
| download | numpy-50752aa920be32b74c1a7d0e4242e84b15ffa73c.tar.gz | |
ENH, SIMD: Added integer dispatch
| -rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src new file mode 100644 index 000000000..0e68f1b7b --- /dev/null +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -0,0 +1,131 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 avx2 avx512_skx + ** vsx2 + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +#include<signal.h> +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +//############################################################################### +//## Unsigned Integers +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#ifdef NPY_SIMD +/**begin repeat + * #sfx = u8, u16, u32, u64# + */ + +static void simd_divide_by_scalar_contig_contig_@sfx@ +(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst, + int len) +{ + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); + npyv_store_@sfx@(dst, c); + } + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst = a / scalar; + } + npyv_cleanup(); +} + +/**end repeat**/ +#endif + + + +// XXX Need to see what can be done for 64 bits +/**begin repeat + * Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ +#if NPY_BITSOF_@SIGNED_TYPE@ <= 8 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8 +#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16 +#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32 + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32 +#else + #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64 +#endif +static NPY_INLINE int +run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + BINARY_DEFS + + if (n == 0) { + return 1; + } + + const @type@ in2 = *(@type@ *)ip2; + if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + BINARY_LOOP_SLIDING { + *((@type@ *)op1) = 0; + } + return 1; + } +#if defined NPY_SIMD + #ifdef NPY_HAVE_AVX512F + const npy_intp vector_size_bytes = 64; + #elif defined NPY_HAVE_AVX2 + const npy_intp vector_size_bytes = 32; + #else + const npy_intp vector_size_bytes = 16; + #endif + // XXX Implement other loops + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) { + simd_divide_by_scalar_@type@(ip1, in2, op1, n); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + +/**begin repeat + * Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(@type@) { + io1 /= *(@type@ *)ip2; + } + *((@type@ *)iop1) = io1; + } + else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (in2 == 0) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + } + *((@type@ *)op1) = in1 / in2; + } + } +} +/**end repeat**/ |
