MAINT: restore auto-vectorization of inplace operations

GCC 6/7 lost the ability to vectorize in-place operations with our current hinting. This makes in-place operations slower than out-of-place operations, which is bad, especially now that we automatically avoid temporaries. The issue has been filed as GCC PR80198. Luckily, GCC also has a no-loop-dependence pragma (`#pragma GCC ivdep`), which we can use to enforce vectorization in the in-place code path. In the in-place scalar path, an extra code hint is sufficient.
author: Julian Taylor <jtaylor.debian@googlemail.com> 2017-03-27 14:55:35 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2017-03-27 15:26:09 +0200
commit: 1964b6a4c5ee377e6489923cbc6b2c61b013a6d5 (patch)
tree: fe079c7ca87cc711a8f8f4e1c0ec54b6e0786ce0 /numpy
parent: 1e8143cc4a1c29ec1ca3aa368363839f4d163648 (diff)
download: numpy-1964b6a4c5ee377e6489923cbc6b2c61b013a6d5.tar.gz
1 files changed, 35 insertions, 4 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 107d525fc..24364afbd 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -138,6 +138,29 @@
         tout * out = (tout *)op1; \
         op; \
     }
+/*
+ * IVDEP_LOOP: gcc 6/7 regressed and needs an additional hint ("GCC ivdep") to
+ * vectorize in-place operations (GCC PR80198); it expands to nothing otherwise.
+ * Must only be used after op1 == ip1 or op1 == ip2 has been checked.
+ * TODO: using ivdep might allow other compilers to vectorize too
+ */
+#if __GNUC__ >= 6
+#define IVDEP_LOOP _Pragma("GCC ivdep")
+#else /* older gcc and other compilers: no hint is emitted */
+#define IVDEP_LOOP
+#endif
+#define BASE_BINARY_LOOP_INP(tin, tout, op) /* for the in-place case: op1 equals ip1 or ip2 */ \
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
+    npy_intp n = dimensions[0];\
+    npy_intp i;\
+    IVDEP_LOOP /* vectorization hint for the for loop that follows */ \
+    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
+        const tin in1 = *(tin *)ip1; /* both operands are loaded before op runs */ \
+        const tin in2 = *(tin *)ip2; \
+        tout * out = (tout *)op1; /* op is expected to store its result via out */ \
+        op; \
+    }
 #define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
     const tin cin = *(tin *)cinp; \
     BINARY_LOOP { \
@@ -145,15 +168,23 @@
         tout * out = (tout *)op1; \
         op; \
     }
+/* PR80198 again: the in-place scalar loop works without the pragma; the hint is that out is derived from vinp */
+#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
+    const tin cin = *(tin *)cinp; /* scalar operand, read once outside BINARY_LOOP */ \
+    BINARY_LOOP { \
+        const tin vin = *(tin *)vinp; \
+        tout * out = (tout *)vinp; /* store back through the input pointer, not op1 */ \
+        op; \
+    }
 #define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
     /* condition allows compiler to optimize the generic macro */ \
     if (IS_BINARY_CONT(tin, tout)) { \
         if (args[2] == args[0]) { \
-            BASE_BINARY_LOOP(tin, tout, op) \
+            BASE_BINARY_LOOP_INP(tin, tout, op) \
         } \
         else if (args[2] == args[1]) { \
-            BASE_BINARY_LOOP(tin, tout, op) \
+            BASE_BINARY_LOOP_INP(tin, tout, op) \
         } \
         else { \
             BASE_BINARY_LOOP(tin, tout, op) \
@@ -161,7 +192,7 @@
     } \
     else if (IS_BINARY_CONT_S1(tin, tout)) { \
         if (args[1] == args[2]) { \
-            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+            BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
         } \
         else { \
             BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
@@ -169,7 +200,7 @@
     } \
     else if (IS_BINARY_CONT_S2(tin, tout)) { \
         if (args[0] == args[2]) { \
-            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+            BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
         } \
         else { \
             BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
author	Julian Taylor <jtaylor.debian@googlemail.com>	2017-03-27 14:55:35 +0200
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2017-03-27 15:26:09 +0200
commit	1964b6a4c5ee377e6489923cbc6b2c61b013a6d5 (patch)
tree	fe079c7ca87cc711a8f8f4e1c0ec54b6e0786ce0 /numpy
parent	1e8143cc4a1c29ec1ca3aa368363839f4d163648 (diff)
download	numpy-1964b6a4c5ee377e6489923cbc6b2c61b013a6d5.tar.gz