diff options
| author | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-20 16:22:06 +0530 |
|---|---|---|
| committer | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-03-20 16:22:13 +0530 |
| commit | bbb143646cbaad2866ed401ca3c795f083285f78 (patch) | |
| tree | 28220dc64efe91ff41898c56a76b29f005473792 /numpy/core/src | |
| parent | 71e84dcd2ec1a59b6426f05b9095a3a2fd51c01d (diff) | |
| download | numpy-bbb143646cbaad2866ed401ca3c795f083285f78.tar.gz | |
SIMD, MAINT: Refined kernel and inner ufunc functions
Diffstat (limited to 'numpy/core/src')
| -rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 109 |
1 file changed, 43 insertions, 66 deletions
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 0e68f1b7b..a012d50dd 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** sse2 sse41 avx2 avx512_skx + ** sse2 sse41 avx2 avx512f avx512_skx ** vsx2 ** neon **/ @@ -12,26 +12,26 @@ #include "loops_utils.h" #include "loops.h" #include "lowlevel_strided_loops.h" -#include<signal.h> // Provides the various *_LOOP macros #include "fast_loop_macros.h" //############################################################################### -//## Unsigned Integers +//## Division //############################################################################### /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ -#ifdef NPY_SIMD +#if NPY_SIMD /**begin repeat * #sfx = u8, u16, u32, u64# */ - -static void simd_divide_by_scalar_contig_contig_@sfx@ -(npyv_lanetype_@sfx@ *src, const npyv_lanetype_@sfx@ scalar, npyv_lanetype_@sfx@ *dst, - int len) +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { - const int vstep = npyv_nlanes_@sfx@; + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { @@ -39,92 +39,69 @@ static void simd_divide_by_scalar_contig_contig_@sfx@ npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); npyv_store_@sfx@(dst, c); } + for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; *dst = a / scalar; } + npyv_cleanup(); } - /**end repeat**/ #endif 
+/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ - -// XXX Need to see what can be done for 64 bits /**begin repeat * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #SIGNED_TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ -#if NPY_BITSOF_@SIGNED_TYPE@ <= 8 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u8 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 16 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u16 -#elif NPY_BITSOF_@SIGNED_TYPE@ <= 32 - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u32 -#else - #define simd_divide_by_scalar_@type@ simd_divide_by_scalar_contig_contig_u64 -#endif -static NPY_INLINE int -run_binary_simd_divide_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - if (n == 0) { - return 1; - } - - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - BINARY_LOOP_SLIDING { - *((@type@ *)op1) = 0; - } - return 1; - } -#if defined NPY_SIMD - #ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - #elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - #else - const npy_intp vector_size_bytes = 16; - #endif - // XXX Implement other loops - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), (npy_uintp)vector_size_bytes)) { - simd_divide_by_scalar_@type@(ip1, in2, op1, n); - return 1; - } +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_u@len@ +/**end repeat1**/ #endif - return 0; -} -/**end 
repeat**/ -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(@type@) { - io1 /= *(@type@ *)ip2; + const @type@ d = *(@type@ *)ip2; + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 /= d; + } } *((@type@ *)iop1) = io1; } - else if (!run_binary_simd_divide_@TYPE@(args, dimensions, steps)) { +#if NPY_SIMD && defined(TO_SIMD_SFX) + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); + } +#endif + else { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); *((@type@ *)op1) = 0; + } else{ + *((@type@ *)op1) = in1 / in2; } - *((@type@ *)op1) = in1 / in2; } } } |
