summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorJulian Taylor <jtaylor.debian@googlemail.com>2017-03-27 14:55:35 +0200
committerJulian Taylor <jtaylor.debian@googlemail.com>2017-03-27 15:26:09 +0200
commit1964b6a4c5ee377e6489923cbc6b2c61b013a6d5 (patch)
treefe079c7ca87cc711a8f8f4e1c0ec54b6e0786ce0 /numpy
parent1e8143cc4a1c29ec1ca3aa368363839f4d163648 (diff)
downloadnumpy-1964b6a4c5ee377e6489923cbc6b2c61b013a6d5.tar.gz
MAINT: restore auto-vectorization of inplace operations
GCC 6/7 lost the ability to vectorize inplace operations with our current hinting. This causes inplace operations to become slower than out-of-place operations, which is bad — especially as we now automatically avoid temporaries. This issue has been filed as GCC PR80198. Luckily, GCC also has a no-loop-dependence pragma which we can use to enforce the vectorization in the inplace code path. In the inplace scalar path an extra code hint is sufficient.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/umath/loops.c.src39
1 file changed, 35 insertions, 4 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 107d525fc..24364afbd 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -138,6 +138,29 @@
tout * out = (tout *)op1; \
op; \
}
+/*
+ * unfortunately gcc 6/7 regressed and we need to give it additional hints to
+ * vectorize inplace operations (PR80198)
+ * must only be used after op1 == ip1 or ip2 has been checked
+ * TODO: using ivdep might allow other compilers to vectorize too
+ */
+#if __GNUC__ >= 6
+#define IVDEP_LOOP _Pragma("GCC ivdep")
+#else
+#define IVDEP_LOOP
+#endif
+#define BASE_BINARY_LOOP_INP(tin, tout, op) \
+ char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
+ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ IVDEP_LOOP \
+ for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
+ const tin in1 = *(tin *)ip1; \
+ const tin in2 = *(tin *)ip2; \
+ tout * out = (tout *)op1; \
+ op; \
+ }
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
@@ -145,15 +168,23 @@
tout * out = (tout *)op1; \
op; \
}
+/* PR80198 again, scalar works without the pragma */
+#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
+ const tin cin = *(tin *)cinp; \
+ BINARY_LOOP { \
+ const tin vin = *(tin *)vinp; \
+ tout * out = (tout *)vinp; \
+ op; \
+ }
#define BINARY_LOOP_FAST(tin, tout, op) \
do { \
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (args[2] == args[0]) { \
- BASE_BINARY_LOOP(tin, tout, op) \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (args[2] == args[1]) { \
- BASE_BINARY_LOOP(tin, tout, op) \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
BASE_BINARY_LOOP(tin, tout, op) \
@@ -161,7 +192,7 @@
} \
else if (IS_BINARY_CONT_S1(tin, tout)) { \
if (args[1] == args[2]) { \
- BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
@@ -169,7 +200,7 @@
} \
else if (IS_BINARY_CONT_S2(tin, tout)) { \
if (args[0] == args[2]) { \
- BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
} \
else { \
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \