author     Julian Taylor <jtaylor.debian@googlemail.com>   2013-06-07 19:22:51 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>   2013-06-08 20:44:05 +0200
commit     9d5884b0acf935401f0f8e64912b98abc73f62c3 (patch)
tree       ead794dec512d321c01b3527ba90a431988d3bbd /numpy
parent     abad5e3a753a2d0f5bbd7bdf4e8769cf9a4ef02d (diff)
download   numpy-9d5884b0acf935401f0f8e64912b98abc73f62c3.tar.gz
ENH: Vectorize float absolute operation with sse2
fabs on x86 can be implemented by masking out the sign bit.
Obtaining such a bit pattern is best done by a bitwise not on the
negative zero.
This is the same operation the compiler converts fabs to on amd64.
Improves performance by about 1.7x/3.5x for float/double on cached data
and about 1.4x/1.1x on non-cached data.
If the loops were simplified gcc could also autovectorize them, but even
with all hints the result is almost the same code length and slightly
worse assembly.
The code can easily be extended to support AVX by changing vpre and
vtype to their 256-bit variants.
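
For illustration only (not part of the commit): a minimal standalone C sketch of
the sign-mask trick described above, using plain SSE2 intrinsics for the float
case. It assumes a compiler that provides emmintrin.h; the variable names are
made up for the example.

    /* sign-mask fabs sketch: abs(x) = x & ~signbit */
    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        /* -0.0f has only the sign bit set; andnot clears exactly that bit */
        const __m128 mask = _mm_set1_ps(-0.0f);
        float in[4] = {-1.5f, 2.0f, -0.0f, -3.25f};
        float out[4];

        __m128 a = _mm_loadu_ps(in);
        _mm_storeu_ps(out, _mm_andnot_ps(mask, a));   /* out = in & ~signbit */

        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
        return 0;
    }
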
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops.c.src   55
-rw-r--r--   numpy/core/tests/test_umath.py     26
2 files changed, 76 insertions, 5 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index de78904bb..e15909029 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1302,8 +1302,11 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
  * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
  * #vsuf = ps, pd#
+ * #vtype = __m128, __m128d#
  */
 
 #ifdef HAVE_EMMINTRIN_H
@@ -1335,6 +1338,40 @@ sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
 }
 
+static void
+sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+    if (npy_is_aligned(&ip1[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+}
+
 
 #endif
 
 
@@ -1582,11 +1619,19 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ tmp = in1 > 0 ? in1 : -in1;
-        /* add 0 to clear -0.0 */
-        *((@type@ *)op1) = tmp + 0;
+#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_absolute_@TYPE@(args, dimensions, steps);
+    }
+    else
+#endif
+    {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ tmp = in1 > 0 ? in1 : -in1;
+            /* add 0 to clear -0.0 */
+            *((@type@ *)op1) = tmp + 0;
+        }
     }
 }
 
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
         np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
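
For readers unfamiliar with the numpy-internal blocking macros used above, here
is a hedged, self-contained sketch of the same peel/vector/tail structure for a
contiguous float array, with the UNARY_LOOP_BLOCK_ALIGN_OUT, UNARY_LOOP_BLOCKED
and UNARY_LOOP_BLOCKED_END steps written out as plain loops. The function name
my_abs_float and its signature are illustrative only, not numpy API.

    /* peel/vector/tail sketch of a blocked SSE2 absolute-value loop */
    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    void my_abs_float(const float *in, float *out, size_t n)
    {
        const __m128 mask = _mm_set1_ps(-0.0f);   /* only the sign bit set */
        size_t i = 0;

        /* peel scalars until the output pointer is 16-byte aligned */
        for (; i < n && ((uintptr_t)&out[i] % 16) != 0; i++) {
            float tmp = in[i] > 0 ? in[i] : -in[i];
            out[i] = tmp + 0;   /* add 0 to clear -0.0 */
        }
        /* main loop: aligned stores, load variant chosen by input alignment */
        if (((uintptr_t)&in[i] % 16) == 0) {
            for (; i + 4 <= n; i += 4)
                _mm_store_ps(&out[i], _mm_andnot_ps(mask, _mm_load_ps(&in[i])));
        }
        else {
            for (; i + 4 <= n; i += 4)
                _mm_store_ps(&out[i], _mm_andnot_ps(mask, _mm_loadu_ps(&in[i])));
        }
        /* scalar tail for the remaining elements */
        for (; i < n; i++) {
            float tmp = in[i] > 0 ? in[i] : -in[i];
            out[i] = tmp + 0;
        }
    }

The aligned/unaligned load split mirrors the npy_is_aligned branch in the
commit; only the output pointer is forced to alignment by the scalar peel, so
stores can always use the aligned variant.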