Merge pull request #7999 from juliantaylor/inplace-opt

ENH: add inplace cases to fast ufunc loop macros
author: seberg <sebastian@sipsolutions.net> 2016-09-02 10:20:24 +0200
committer: GitHub <noreply@github.com> 2016-09-02 10:20:24 +0200
commit: 8eedd3e911b7e3f5f35961871ddb8ee1559d4225 (patch)
tree: cfe1aa43b2e527ee7dd403f6990f4dcf5260cb51 /numpy
parent: a93d9f7a97358e618aa52b2bbfa119317ee56d08 (diff)
parent: d555a0ad0f1191daf8ae83e10933da5556b2510e (diff)
download: numpy-8eedd3e911b7e3f5f35961871ddb8ee1559d4225.tar.gz
4 files changed, 64 insertions, 43 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 157b30e70..2720c361f 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -87,22 +87,25 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_UNARY_LOOP(tin, tout, op) \
+    UNARY_LOOP { \
+        const tin in = *(tin *)ip1; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define UNARY_LOOP_FAST(tin, tout, op) \
     do { \
     /* condition allows compiler to optimize the generic macro */ \
     if (IS_UNARY_CONT(tin, tout)) { \
-        UNARY_LOOP { \
-            const tin in = *(tin *)ip1; \
-            tout * out = (tout *)op1; \
-            op; \
+        if (args[0] == args[1]) { \
+            BASE_UNARY_LOOP(tin, tout, op) \
+        } \
+        else { \
+            BASE_UNARY_LOOP(tin, tout, op) \
         } \
     } \
     else { \
-        UNARY_LOOP { \
-            const tin in = *(tin *)ip1; \
-            tout * out = (tout *)op1; \
-            op; \
-        } \
+        BASE_UNARY_LOOP(tin, tout, op) \
     } \
     } \
     while (0)
@@ -128,40 +131,52 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_BINARY_LOOP(tin, tout, op) \
+    BINARY_LOOP { \
+        const tin in1 = *(tin *)ip1; \
+        const tin in2 = *(tin *)ip2; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
+#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
+    const tin cin = *(tin *)cinp; \
+    BINARY_LOOP { \
+        const tin vin = *(tin *)vinp; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
     /* condition allows compiler to optimize the generic macro */ \
     if (IS_BINARY_CONT(tin, tout)) { \
-        BINARY_LOOP { \
-            const tin in1 = *(tin *)ip1; \
-            const tin in2 = *(tin *)ip2; \
-            tout * out = (tout *)op1; \
-            op; \
+        if (args[2] == args[0]) { \
+            BASE_BINARY_LOOP(tin, tout, op) \
+        } \
+        else if (args[2] == args[1]) { \
+            BASE_BINARY_LOOP(tin, tout, op) \
+        } \
+        else { \
+            BASE_BINARY_LOOP(tin, tout, op) \
         } \
     } \
     else if (IS_BINARY_CONT_S1(tin, tout)) { \
-        const tin in1 = *(tin *)args[0]; \
-        BINARY_LOOP { \
-            const tin in2 = *(tin *)ip2; \
-            tout * out = (tout *)op1; \
-            op; \
+        if (args[1] == args[2]) { \
+            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+        } \
+        else { \
+            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
         } \
     } \
     else if (IS_BINARY_CONT_S2(tin, tout)) { \
-        const tin in2 = *(tin *)args[1]; \
-        BINARY_LOOP { \
-            const tin in1 = *(tin *)ip1; \
-            tout * out = (tout *)op1; \
-            op; \
+        if (args[0] == args[2]) { \
+            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
         } \
+        else { \
+            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+        }\
     } \
     else { \
-        BINARY_LOOP { \
-            const tin in1 = *(tin *)ip1; \
-            const tin in2 = *(tin *)ip2; \
-            tout * out = (tout *)op1; \
-            op; \
-        } \
+        BASE_BINARY_LOOP(tin, tout, op) \
     } \
     } \
     while (0)
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 1c71565f4..0ae013f12 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -65,7 +65,7 @@ class TestBaseMath(TestCase):
     def test_blocked(self):
         # test alignments offsets for simd instructions
         # alignments for vz + 2 * (vs - 1) + 1
-        for dt, sz in [(np.float32, 11), (np.float64, 7)]:
+        for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]:
             for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt,
                                                             type='binary',
                                                             max_size=sz):
@@ -73,7 +73,7 @@ class TestBaseMath(TestCase):
                 inp1[...] = np.ones_like(inp1)
                 inp2[...] = np.zeros_like(inp2)
                 assert_almost_equal(np.add(inp1, inp2), exp1, err_msg=msg)
-                assert_almost_equal(np.add(inp1, 1), exp1 + 1, err_msg=msg)
+                assert_almost_equal(np.add(inp1, 2), exp1 + 2, err_msg=msg)
                 assert_almost_equal(np.add(1, inp2), exp1, err_msg=msg)
 
                 np.add(inp1, inp2, out=out)
@@ -82,15 +82,17 @@ class TestBaseMath(TestCase):
                 inp2[...] += np.arange(inp2.size, dtype=dt) + 1
                 assert_almost_equal(np.square(inp2),
                                     np.multiply(inp2, inp2),  err_msg=msg)
-                assert_almost_equal(np.reciprocal(inp2),
-                                    np.divide(1, inp2),  err_msg=msg)
+                # skip true divide for ints
+                if dt != np.int32 or sys.version_info.major < 3:
+                    assert_almost_equal(np.reciprocal(inp2),
+                                        np.divide(1, inp2),  err_msg=msg)
 
                 inp1[...] = np.ones_like(inp1)
-                inp2[...] = np.zeros_like(inp2)
-                np.add(inp1, 1, out=out)
-                assert_almost_equal(out, exp1 + 1, err_msg=msg)
-                np.add(1, inp2, out=out)
-                assert_almost_equal(out, exp1, err_msg=msg)
+                np.add(inp1, 2, out=out)
+                assert_almost_equal(out, exp1 + 2, err_msg=msg)
+                inp2[...] = np.ones_like(inp2)
+                np.add(2, inp2, out=out)
+                assert_almost_equal(out, exp1 + 2, err_msg=msg)
 
     def test_lower_align(self):
         # check data that is not aligned to element size
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 759e996e3..db5621279 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1202,8 +1202,9 @@ class TestAbsoluteNegative(TestCase):
                             assert_array_equal(out, d, err_msg=msg)
 
                             assert_array_equal(-inp, -1*inp, err_msg=msg)
+                            d = -1 * inp
                             np.negative(inp, out=out)
-                            assert_array_equal(out, -1*inp, err_msg=msg)
+                            assert_array_equal(out, d, err_msg=msg)
 
     def test_lower_align(self):
         # check data that is not aligned to element size
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index c7f4a0aa7..9c432f3dc 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1794,7 +1794,8 @@ def _gen_alignment_data(dtype=float32, type='binary', max_size=24):
                 inp = lambda: arange(s, dtype=dtype)[o:]
                 out = empty((s,), dtype=dtype)[o:]
                 yield out, inp(), ufmt % (o, o, s, dtype, 'out of place')
-                yield inp(), inp(), ufmt % (o, o, s, dtype, 'in place')
+                d = inp()
+                yield d, d, ufmt % (o, o, s, dtype, 'in place')
                 yield out[1:], inp()[:-1], ufmt % \
                     (o + 1, o, s - 1, dtype, 'out of place')
                 yield out[:-1], inp()[1:], ufmt % \
@@ -1809,9 +1810,11 @@ def _gen_alignment_data(dtype=float32, type='binary', max_size=24):
                 out = empty((s,), dtype=dtype)[o:]
                 yield out, inp1(), inp2(),  bfmt % \
                     (o, o, o, s, dtype, 'out of place')
-                yield inp1(), inp1(), inp2(), bfmt % \
+                d = inp1()
+                yield d, d, inp2(), bfmt % \
                     (o, o, o, s, dtype, 'in place1')
-                yield inp2(), inp1(), inp2(), bfmt % \
+                d = inp2()
+                yield d, inp1(), d, bfmt % \
                     (o, o, o, s, dtype, 'in place2')
                 yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'out of place')
author	seberg <sebastian@sipsolutions.net>	2016-09-02 10:20:24 +0200
committer	GitHub <noreply@github.com>	2016-09-02 10:20:24 +0200
commit	8eedd3e911b7e3f5f35961871ddb8ee1559d4225 (patch)
tree	cfe1aa43b2e527ee7dd403f6990f4dcf5260cb51 /numpy
parent	a93d9f7a97358e618aa52b2bbfa119317ee56d08 (diff)
parent	d555a0ad0f1191daf8ae83e10933da5556b2510e (diff)
download	numpy-8eedd3e911b7e3f5f35961871ddb8ee1559d4225.tar.gz