diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2017-03-27 14:55:35 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2017-03-27 15:26:09 +0200 |
commit | 1964b6a4c5ee377e6489923cbc6b2c61b013a6d5 (patch) | |
tree | fe079c7ca87cc711a8f8f4e1c0ec54b6e0786ce0 /numpy | |
parent | 1e8143cc4a1c29ec1ca3aa368363839f4d163648 (diff) | |
download | numpy-1964b6a4c5ee377e6489923cbc6b2c61b013a6d5.tar.gz |
MAINT: restore auto-vectorization of inplace operations
GCC 6/7 lost the ability to vectorize in-place operations with our
current hinting. This makes in-place operations slower than
out-of-place operations, which is bad, especially now that we automatically
avoid temporaries.
This issue has been filed as GCC PR80198.
Luckily, GCC also has a no-loop-dependence pragma (`GCC ivdep`), which we can use to
enforce vectorization in the in-place code path.
In the in-place scalar path, an extra code hint is sufficient.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 39 |
1 files changed, 35 insertions, 4 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 107d525fc..24364afbd 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -138,6 +138,29 @@ tout * out = (tout *)op1; \ op; \ } +/* + * unfortunately gcc 6/7 regressed and we need to give it additional hints to + * vectorize inplace operations (PR80198) + * must only be used after op1 == ip1 or ip2 has been checked + * TODO: using ivdep might allow other compilers to vectorize too + */ +#if __GNUC__ >= 6 +#define IVDEP_LOOP _Pragma("GCC ivdep") +#else +#define IVDEP_LOOP +#endif +#define BASE_BINARY_LOOP_INP(tin, tout, op) \ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\ + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\ + npy_intp n = dimensions[0];\ + npy_intp i;\ + IVDEP_LOOP \ + for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \ + const tin in1 = *(tin *)ip1; \ + const tin in2 = *(tin *)ip2; \ + tout * out = (tout *)op1; \ + op; \ + } #define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \ const tin cin = *(tin *)cinp; \ BINARY_LOOP { \ @@ -145,15 +168,23 @@ tout * out = (tout *)op1; \ op; \ } +/* PR80198 again, scalar works without the pragma */ +#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \ + const tin cin = *(tin *)cinp; \ + BINARY_LOOP { \ + const tin vin = *(tin *)vinp; \ + tout * out = (tout *)vinp; \ + op; \ + } #define BINARY_LOOP_FAST(tin, tout, op) \ do { \ /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ if (args[2] == args[0]) { \ - BASE_BINARY_LOOP(tin, tout, op) \ + BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else if (args[2] == args[1]) { \ - BASE_BINARY_LOOP(tin, tout, op) \ + BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else { \ BASE_BINARY_LOOP(tin, tout, op) \ @@ -161,7 +192,7 @@ } \ else if (IS_BINARY_CONT_S1(tin, tout)) { \ if (args[1] == args[2]) { \ - BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \ + 
BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \ } \ else { \ BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \ @@ -169,7 +200,7 @@ } \ else if (IS_BINARY_CONT_S2(tin, tout)) { \ if (args[0] == args[2]) { \ - BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \ + BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \ } \ else { \ BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \ |