summaryrefslogtreecommitdiff
path: root/numpy/core/src
diff options
context:
space:
mode:
authorGanesh Kathiresan <ganesh3597@gmail.com>2021-03-20 16:22:06 +0530
committerGanesh Kathiresan <ganesh3597@gmail.com>2021-03-20 16:22:13 +0530
commitbbb143646cbaad2866ed401ca3c795f083285f78 (patch)
tree28220dc64efe91ff41898c56a76b29f005473792 /numpy/core/src
parent71e84dcd2ec1a59b6426f05b9095a3a2fd51c01d (diff)
downloadnumpy-bbb143646cbaad2866ed401ca3c795f083285f78.tar.gz
SIMD, MAINT: Refined kernel and inner ufunc functions
Diffstat (limited to 'numpy/core/src')
-rw-r--r--numpy/core/src/umath/loops_arithmetic.dispatch.c.src109
1 file changed, 43 insertions, 66 deletions
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 0e68f1b7b..a012d50dd 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -1,6 +1,6 @@
/*@targets
** $maxopt baseline
- ** sse2 sse41 avx2 avx512_skx
+ ** sse2 sse41 avx2 avx512f avx512_skx
** vsx2
** neon
**/
@@ -12,26 +12,26 @@
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
-#include<signal.h>
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
//###############################################################################
-//## Unsigned Integers
+//## Division
//###############################################################################
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
-#ifdef NPY_SIMD
+#if NPY_SIMD
/**begin repeat
* #sfx = u8, u16, u32, u64#
*/
-
-static void simd_divide_by_scalar_contig_contig_@sfx@
-(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst,
- int len)
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
- const int vstep = npyv_nlanes_@sfx@;
+ npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+ const int vstep = npyv_nlanes_@sfx@;
const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
@@ -39,92 +39,69 @@ static void simd_divide_by_scalar_contig_contig_@sfx@
npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
npyv_store_@sfx@(dst, c);
}
+
for (; len > 0; --len, ++src, ++dst) {
const npyv_lanetype_@sfx@ a = *src;
*dst = a / scalar;
}
+
npyv_cleanup();
}
-
/**end repeat**/
#endif
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
-
-// XXX Need to see what can be done for 64 bits
/**begin repeat
* Unsigned types
- * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
- * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
- * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
*/
-#if NPY_BITSOF_@SIGNED_TYPE@ <= 8
- #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8
-#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16
- #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16
-#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32
- #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32
-#else
- #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64
-#endif
-static NPY_INLINE int
-run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
- BINARY_DEFS
-
- if (n == 0) {
- return 1;
- }
-
- const @type@ in2 = *(@type@ *)ip2;
- if (in2 == 0) {
- npy_set_floatstatus_divbyzero();
- BINARY_LOOP_SLIDING {
- *((@type@ *)op1) = 0;
- }
- return 1;
- }
-#if defined NPY_SIMD
- #ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- #elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- #else
- const npy_intp vector_size_bytes = 16;
- #endif
- // XXX Implement other loops
- if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) {
- simd_divide_by_scalar_@type@(ip1, in2, op1, n);
- return 1;
- }
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@STYPE@ == @len@
+ #define TO_SIMD_SFX(X) X##_u@len@
+/**end repeat1**/
#endif
- return 0;
-}
-/**end repeat**/
-/**begin repeat
- * Unsigned types
- * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
- * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
- */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (IS_BINARY_REDUCE) {
BINARY_REDUCE_LOOP(@type@) {
- io1 /= *(@type@ *)ip2;
+ const @type@ d = *(@type@ *)ip2;
+ if (NPY_UNLIKELY(d == 0)) {
+ npy_set_floatstatus_divbyzero();
+ io1 = 0;
+ } else {
+ io1 /= d;
+ }
}
*((@type@ *)iop1) = io1;
}
- else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) {
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+ // for contiguous block of memory, divisor is a scalar and not 0
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+ (*(@type@ *)args[1]) != 0) {
+ TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+ }
+#endif
+ else {
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
- if (in2 == 0) {
+ if (NPY_UNLIKELY(in2 == 0)) {
npy_set_floatstatus_divbyzero();
*((@type@ *)op1) = 0;
+ } else{
+ *((@type@ *)op1) = in1 / in2;
}
- *((@type@ *)op1) = in1 / in2;
}
}
}