author     Julian Taylor <jtaylor.debian@googlemail.com>   2016-08-31 20:15:16 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>   2016-09-01 09:58:24 +0200
commit     fd298a341ddeb05c471c8dfc16f4cc641d08f8a7 (patch)
tree       65ff792e70bd848d90250195ee2f03b50618d905
parent     a93d9f7a97358e618aa52b2bbfa119317ee56d08 (diff)
download   numpy-fd298a341ddeb05c471c8dfc16f4cc641d08f8a7.tar.gz
ENH: add inplace cases to fast ufunc loop macros
Neither gcc nor clang automatically specializes the in-place case, so add
extra conditions to the loop macros to get the compilers to emit decent code.
Without them, in-place code ends up much slower than the equivalent
out-of-place code.
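Why the identical-looking branches help (a minimal standalone sketch, not NumPy code; the function and variable names below are invented for illustration): the compiler compiles each branch body separately, so under an explicit out == in check the in-place copy of the loop is compiled with the knowledge that input and output are the same buffer, while the out-of-place copy is specialized on its own, instead of one conservative loop covering every aliasing pattern.

/* Minimal sketch, not NumPy code: the two loop bodies are identical, but
 * because they sit under an explicit aliasing check the compiler can
 * specialize (and vectorize) each copy for its own aliasing pattern. */
#include <stddef.h>

static void
add_one(const double *in, double *out, size_t n)
{
    if (out == in) {
        /* in-place: out is known to be exactly in, no partial overlap */
        for (size_t i = 0; i < n; i++) {
            out[i] = in[i] + 1.0;
        }
    }
    else {
        /* out-of-place copy of the same loop, specialized separately */
        for (size_t i = 0; i < n; i++) {
            out[i] = in[i] + 1.0;
        }
    }
}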
-rw-r--r--  benchmarks/benchmarks/bench_ufunc.py  | 39
-rw-r--r--  numpy/core/src/umath/loops.c.src      | 75
-rw-r--r--  numpy/core/tests/test_scalarmath.py   |  8
3 files changed, 89 insertions(+), 33 deletions(-)
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 0140718e0..1baee1340 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -77,6 +77,45 @@ class Custom(Benchmark):
         (self.b | self.b)
 
 
+class CustomInplace(Benchmark):
+    def setup(self):
+        self.c = np.ones(500000, dtype=np.int8)
+        self.i = np.ones(150000, dtype=np.int32)
+        self.f = np.zeros(150000, dtype=np.float32)
+        self.d = np.zeros(75000, dtype=np.float64)
+        # fault memory
+        self.f *= 1.
+        self.d *= 1.
+
+    def time_char_or(self):
+        np.bitwise_or(self.c, 0, out=self.c)
+        np.bitwise_or(0, self.c, out=self.c)
+
+    def time_char_or_temp(self):
+        0 | self.c | 0
+
+    def time_int_or(self):
+        np.bitwise_or(self.i, 0, out=self.i)
+        np.bitwise_or(0, self.i, out=self.i)
+
+    def time_int_or_temp(self):
+        0 | self.i | 0
+
+    def time_float_add(self):
+        np.add(self.f, 1., out=self.f)
+        np.add(1., self.f, out=self.f)
+
+    def time_float_add_temp(self):
+        1. + self.f + 1.
+
+    def time_double_add(self):
+        np.add(self.d, 1., out=self.d)
+        np.add(1., self.d, out=self.d)
+
+    def time_double_add_temp(self):
+        1. + self.d + 1.
+
+
 class CustomScalar(Benchmark):
     params = [np.float32, np.float64]
     param_names = ['dtype']
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 157b30e70..2720c361f 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -87,22 +87,25 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_UNARY_LOOP(tin, tout, op) \
+    UNARY_LOOP { \
+        const tin in = *(tin *)ip1; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define UNARY_LOOP_FAST(tin, tout, op) \
     do { \
         /* condition allows compiler to optimize the generic macro */ \
         if (IS_UNARY_CONT(tin, tout)) { \
-            UNARY_LOOP { \
-                const tin in = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[0] == args[1]) { \
+                BASE_UNARY_LOOP(tin, tout, op) \
+            } \
+            else { \
+                BASE_UNARY_LOOP(tin, tout, op) \
             } \
         } \
         else { \
-            UNARY_LOOP { \
-                const tin in = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
-            } \
+            BASE_UNARY_LOOP(tin, tout, op) \
         } \
     } \
     while (0)
@@ -128,40 +131,52 @@
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
+#define BASE_BINARY_LOOP(tin, tout, op) \
+    BINARY_LOOP { \
+        const tin in1 = *(tin *)ip1; \
+        const tin in2 = *(tin *)ip2; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
+#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
+    const tin cin = *(tin *)cinp; \
+    BINARY_LOOP { \
+        const tin vin = *(tin *)vinp; \
+        tout * out = (tout *)op1; \
+        op; \
+    }
 #define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
         /* condition allows compiler to optimize the generic macro */ \
         if (IS_BINARY_CONT(tin, tout)) { \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[2] == args[0]) { \
+                BASE_BINARY_LOOP(tin, tout, op) \
+            } \
+            else if (args[2] == args[1]) { \
+                BASE_BINARY_LOOP(tin, tout, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP(tin, tout, op) \
             } \
         } \
         else if (IS_BINARY_CONT_S1(tin, tout)) { \
-            const tin in1 = *(tin *)args[0]; \
-            BINARY_LOOP { \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[1] == args[2]) { \
+                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
             } \
         } \
         else if (IS_BINARY_CONT_S2(tin, tout)) { \
-            const tin in2 = *(tin *)args[1]; \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                tout * out = (tout *)op1; \
-                op; \
+            if (args[0] == args[2]) { \
+                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
             } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+            }\
         } \
         else { \
-            BINARY_LOOP { \
-                const tin in1 = *(tin *)ip1; \
-                const tin in2 = *(tin *)ip2; \
-                tout * out = (tout *)op1; \
-                op; \
-            } \
+            BASE_BINARY_LOOP(tin, tout, op) \
         } \
     } \
     while (0)
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 1c71565f4..f9aeb6382 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -65,7 +65,7 @@ class TestBaseMath(TestCase):
     def test_blocked(self):
         # test alignments offsets for simd instructions
         # alignments for vz + 2 * (vs - 1) + 1
-        for dt, sz in [(np.float32, 11), (np.float64, 7)]:
+        for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]:
             for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt,
                                                             type='binary',
                                                             max_size=sz):
@@ -82,8 +82,10 @@ class TestBaseMath(TestCase):
                 inp2[...] += np.arange(inp2.size, dtype=dt) + 1
                 assert_almost_equal(np.square(inp2),
                                     np.multiply(inp2, inp2), err_msg=msg)
-                assert_almost_equal(np.reciprocal(inp2),
-                                    np.divide(1, inp2), err_msg=msg)
+                # skip true divide for ints
+                if dt != np.int32 or sys.version_info.major < 3:
+                    assert_almost_equal(np.reciprocal(inp2),
+                                        np.divide(1, inp2), err_msg=msg)
 
                 inp1[...] = np.ones_like(inp1)
                 inp2[...] = np.zeros_like(inp2)
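The diff only shows the macro definitions, not a consumer. As a purely illustrative sketch (hypothetical names, not taken from loops.c.src, which generates its inner loops from the .c.src template and also handles strided and scalar-operand cases), a contiguous binary loop built in this style would look roughly like this: the same base loop is repeated under each aliasing branch so the compiler emits a separately optimized copy per case.

/* Hypothetical usage sketch, not NumPy code: a stripped-down
 * BINARY_LOOP_FAST-style helper. The branch bodies are textually
 * identical; the pointer comparisons only exist to hand the optimizer
 * a known aliasing relationship in each compiled copy. */
#include <stddef.h>

#define BASE_BINARY_LOOP_SKETCH(tin, tout, op, n, ip1, ip2, op1) \
    for (size_t i = 0; i < (n); i++) { \
        const tin in1 = ((const tin *)(ip1))[i]; \
        const tin in2 = ((const tin *)(ip2))[i]; \
        tout *out = (tout *)(op1) + i; \
        op; \
    }

static void
int_add_contig(char **args, size_t n)
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    if (op1 == ip1) {        /* in-place on the first operand */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
    else if (op1 == ip2) {   /* in-place on the second operand */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
    else {                   /* distinct output buffer */
        BASE_BINARY_LOOP_SKETCH(int, int, *out = in1 + in2, n, ip1, ip2, op1)
    }
}

The deliberate duplication is the whole point: without it, a single generic loop has to stay correct for every possible overlap of args[0], args[1], and args[2], which is what made the in-place benchmarks above slower than the out-of-place ones.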