author     Julian Taylor <jtaylor.debian@googlemail.com>   2016-08-31 20:15:16 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>   2016-09-01 09:58:24 +0200
commit     fd298a341ddeb05c471c8dfc16f4cc641d08f8a7 (patch)
tree       65ff792e70bd848d90250195ee2f03b50618d905
parent     a93d9f7a97358e618aa52b2bbfa119317ee56d08 (diff)
download   numpy-fd298a341ddeb05c471c8dfc16f4cc641d08f8a7.tar.gz
ENH: add inplace cases to fast ufunc loop macros
Neither gcc nor clang automatically specializes the in-place case, so add
extra conditions to the loop macros to get the compilers to emit decent code.
Without them, in-place code ends up much slower than the equivalent
out-of-place code.
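Why the identical-looking branches help (a minimal standalone sketch, not NumPy code; the function and variable names below are invented for illustration): the compiler compiles each branch body separately, so under an explicit out == in check the in-place copy of the loop is compiled with the knowledge that input and output are the same buffer, while the out-of-place copy is specialized on its own, instead of one conservative loop covering every aliasing pattern.

/* Minimal sketch, not NumPy code: the two loop bodies are identical, but
 * because they sit under an explicit aliasing check the compiler can
 * specialize (and vectorize) each copy for its own aliasing pattern. */
#include <stddef.h>

static void
add_one(const double *in, double *out, size_t n)
{
    if (out == in) {
        /* in-place: out is known to be exactly in, no partial overlap */
        for (size_t i = 0; i < n; i++) {
            out[i] = in[i] + 1.0;
        }
    }
    else {
        /* out-of-place copy of the same loop, specialized separately */
        for (size_t i = 0; i < n; i++) {
            out[i] = in[i] + 1.0;
        }
    }
}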
-rw-r--r--  benchmarks/benchmarks/bench_ufunc.py  | 39
-rw-r--r--  numpy/core/src/umath/loops.c.src      | 75
-rw-r--r--  numpy/core/tests/test_scalarmath.py   |  8
3 files changed, 89 insertions(+), 33 deletions(-)
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 0140718e0..1baee1340 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -77,6 +77,45 @@ class Custom(Benchmark):
         (self.b | self.b)
 
 
+class CustomInplace(Benchmark):
+    def setup(self):
+        self.c = np.ones(500000, dtype=np.int8)
+        self.i = np.ones(150000, dtype=np.int32)
+        self.f = np.zeros(150000, dtype=np.float32)
+        self.d = np.zeros(75000, dtype=np.float64)
+        # fault memory
+        self.f *= 1.
+        self.d *= 1.
+
+    def time_char_or(self):
+        np.bitwise_or(self.c, 0, out=self.c)
+        np.bitwise_or(0, self.c, out=self.c)
+
+    def time_char_or_temp(self):
+        0 | self.c | 0
+
+    def time_int_or(self):
+        np.bitwise_or(self.i, 0, out=self.i)
+        np.bitwise_or(0, self.i, out=self.i)
+
+    def time_int_or_temp(self):
+        0 | self.i | 0
+
+    def time_float_add(self):
+        np.add(self.f, 1., out=self.f)
+        np.add(1., self.f, out=self.f)
+
+    def time_float_add_temp(self):
+        1. + self.f + 1.
+
+    def time_double_add(self):
+        np.add(self.d, 1., out=self.d)
+        np.add(1., self.d, out=self.d)
+
+    def time_double_add_temp(self):
+        1. + self.d + 1.
+
+
 class CustomScalar(Benchmark):
     params = [np.float32, np.float64]
     param_names = ['dtype']
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 157b30e70..2720c361f 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -87,22 +87,25 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_UNARY_LOOP(tin, tout, op) \
+    UNARY_LOOP { \
+        const tin in = *(tin *)ip1; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define UNARY_LOOP_FAST(tin, tout, op) \
     do { \
         /* condition allows compiler to optimize the generic macro */ \
         if (IS_UNARY_CONT(tin, tout)) { \
-            UNARY_LOOP { \
-                const tin in = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[0] == args[1]) { \
+                BASE_UNARY_LOOP(tin, tout, op) \
+            } \
+            else { \
+                BASE_UNARY_LOOP(tin, tout, op) \
             } \
         } \
         else { \
-            UNARY_LOOP { \
-                const tin in = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
-            } \
+            BASE_UNARY_LOOP(tin, tout, op) \
         } \
     } \
     while (0)
@@ -128,40 +131,52 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_BINARY_LOOP(tin, tout, op) \
+    BINARY_LOOP { \
+        const tin in1 = *(tin *)ip1; \
+        const tin in2 = *(tin *)ip2; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
+#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
+    const tin cin = *(tin *)cinp; \
+    BINARY_LOOP { \
+        const tin vin = *(tin *)vinp; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
         /* condition allows compiler to optimize the generic macro */ \
         if (IS_BINARY_CONT(tin, tout)) { \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[2] == args[0]) { \
+                BASE_BINARY_LOOP(tin, tout, op) \
+            } \
+            else if (args[2] == args[1]) { \
+                BASE_BINARY_LOOP(tin, tout, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP(tin, tout, op) \
             } \
         } \
         else if (IS_BINARY_CONT_S1(tin, tout)) { \
-            const tin in1 = *(tin *)args[0]; \
-            BINARY_LOOP { \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[1] == args[2]) { \
+                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
             } \
         } \
         else if (IS_BINARY_CONT_S2(tin, tout)) { \
-            const tin in2 = *(tin *)args[1]; \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[0] == args[2]) { \
+                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
             } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+            }\
         } \
         else { \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
-            } \
+            BASE_BINARY_LOOP(tin, tout, op) \
         } \
     } \
     while (0)
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 1c71565f4..f9aeb6382 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -65,7 +65,7 @@ class TestBaseMath(TestCase):
     def test_blocked(self):
         # test alignments offsets for simd instructions
         # alignments for vz + 2 * (vs - 1) + 1
-        for dt, sz in [(np.float32, 11), (np.float64, 7)]:
+        for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]:
             for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt,
                                                             type='binary',
                                                             max_size=sz):
@@ -82,8 +82,10 @@ class TestBaseMath(TestCase):
                 inp2[...] += np.arange(inp2.size, dtype=dt) + 1
                 assert_almost_equal(np.square(inp2),
                                     np.multiply(inp2, inp2), err_msg=msg)
-                assert_almost_equal(np.reciprocal(inp2),
-                                    np.divide(1, inp2), err_msg=msg)
+                # skip true divide for ints
+                if dt != np.int32 or sys.version_info.major < 3:
+                    assert_almost_equal(np.reciprocal(inp2),
+                                        np.divide(1, inp2), err_msg=msg)
 
                 inp1[...] = np.ones_like(inp1)
                 inp2[...] = np.zeros_like(inp2)
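The diff only shows the macro definitions, not a consumer. As a purely illustrative sketch (hypothetical names, not taken from loops.c.src, which generates its inner loops from the .c.src template and also handles strided and scalar-operand cases), a contiguous binary loop built in this style would look roughly like this: the same base loop is repeated under each aliasing branch so the compiler emits a separately optimized copy per case.

/* Hypothetical usage sketch, not NumPy code: a stripped-down
 * BINARY_LOOP_FAST-style helper. The branch bodies are textually
 * identical; the pointer comparisons only exist to hand the optimizer
 * a known aliasing relationship in each compiled copy. */
#include <stddef.h>

#define BASE_BINARY_LOOP_SKETCH(tin, tout, op, n, ip1, ip2, op1) \
    for (size_t i = 0; i < (n); i++) { \
        const tin in1 = ((const tin *)(ip1))[i]; \
        const tin in2 = ((const tin *)(ip2))[i]; \
        tout *out = (tout *)(op1) + i; \
        op; \
    }

static void
int_add_contig(char **args, size_t n)
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    if (op1 == ip1) {        /* in-place on the first operand */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
    else if (op1 == ip2) {   /* in-place on the second operand */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
    else {                   /* distinct output buffer */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
}

The deliberate duplication is the whole point: without it, a single generic loop has to stay correct for every possible overlap of args[0], args[1], and args[2], which is what made the in-place benchmarks above slower than the out-of-place ones.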