| field | value | date |
|---|---|---|
| author | Charles Harris <charlesr.harris@gmail.com> | 2013-06-08 14:57:15 -0700 |
| committer | Charles Harris <charlesr.harris@gmail.com> | 2013-06-08 14:57:15 -0700 |
| commit | 3e471304393244eb02215d1601d2396b3f94eb8d (patch) | |
| tree | c837f5bd2272256f361cb2ec6b0d04a84b023a8d | |
| parent | 23a27572994e1c731605c6a78a1564014c2a62c8 (diff) | |
| parent | 7fb8b714906a92516905cc0f03e45511bd1ac1ed (diff) | |
| download | numpy-3e471304393244eb02215d1601d2396b3f94eb8d.tar.gz | |
Merge pull request #3411 from juliantaylor/vectorize-fabs
ENH: Vectorize float absolute operation with sse2
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | doc/release/1.8.0-notes.rst | 18 |
| -rw-r--r-- | numpy/core/bscript | 6 |
| -rw-r--r-- | numpy/core/setup.py | 7 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 79 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 173 |
| -rw-r--r-- | numpy/core/tests/test_umath.py | 26 |
| -rw-r--r-- | numpy/testing/utils.py | 8 |
7 files changed, 250 insertions, 67 deletions
diff --git a/doc/release/1.8.0-notes.rst b/doc/release/1.8.0-notes.rst
index 76dcf50c2..c5315e6cd 100644
--- a/doc/release/1.8.0-notes.rst
+++ b/doc/release/1.8.0-notes.rst
@@ -142,6 +142,24 @@ The `pad` function has a new implementation, greatly improving performance for
 all inputs except `mode=<function>` (retained for backwards compatibility).
 Scaling with dimensionality is dramatically improved for rank >= 4.
 
+Performance improvements to `isnan`, `isinf`, `isfinite` and `byteswap`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`isnan`, `isinf`, `isfinite` and `byteswap` have been improved to take
+advantage of compiler builtins to avoid expensive calls to libc.
+This improves performance of these operations by about a factor of two on gnu
+libc systems.
+
+Performance improvements to `sqrt` and `abs`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The `sqrt` and `abs` functions for unit stride elementary operations have been
+improved to make use of SSE2 CPU SIMD instructions.
+This improves performance of these operations up to 4x/2x for float32/float64
+depending on the location of the data in the CPU caches. The performance gain
+is greatest for in-place operations.
+In order to use the improved functions the SSE2 instruction set must be enabled
+at compile time. It is enabled by default on x86_64 systems. On x86_32 with a
+capable CPU it must be enabled by passing the appropriate flag to CFLAGS build
+variable (-msse2 with gcc).
 
 Changes
 =======
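The first note above attributes the `isnan`/`isinf`/`isfinite`/`byteswap` speedup to compiler builtins replacing libc calls. As a rough, hypothetical sketch of that idea only (this is not code touched by this commit, and `my_isnan` is an invented name, not NumPy's `npy_isnan`):

```c
#include <math.h>

/*
 * Hypothetical illustration: use the gcc/clang builtin when it is
 * available so the check can be inlined, otherwise fall back to the
 * standard C99 isnan() macro from <math.h>.
 */
static inline int
my_isnan(double x)
{
#if defined(__GNUC__)
    return __builtin_isnan(x);
#else
    return isnan(x) != 0;
#endif
}
```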
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 44f7f14f0..ee5543354 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -492,8 +492,10 @@ def pre_build(context):
             pattern="ufunc_api",
             name="ufunc_api")
 
-    ufunc_templates = ["src/umath/loops.c.src",
-                       "src/umath/funcs.inc.src"]
+    ufunc_templates = [
+            "src/umath/loops.c.src",
+            "src/umath/funcs.inc.src",
+            "src/umath/simd.inc.src"]
     bld(target="ufunc_templates", source=ufunc_templates)
 
     bld(features="umath_gen",
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a6ddfb843..926142b55 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -837,7 +837,9 @@ def configuration(parent_package='',top_path=None):
         subpath = join('src', 'umath')
         # NOTE: For manual template conversion of loops.h.src, read the note
         #       in that file.
-        sources = [join(local_dir, subpath, 'loops.c.src')]
+        sources = [
+            join(local_dir, subpath, 'loops.c.src'),
+            join(local_dir, subpath, 'simd.inc.src')]
 
         # numpy.distutils generate .c from .c.src in weird directories, we have
         # to add them there as they depend on the build_dir
@@ -864,12 +866,14 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'umathmodule.c'),
             join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
+            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
             join('src', 'umath', 'ufunc_type_resolution.c')]
 
     umath_deps = [
             generate_umath_py,
+            join('src', 'umath', 'simd.inc.src'),
             join(codegen_dir,'generate_ufunc_api.py')]
 
     if not ENABLE_SEPARATE_COMPILATION:
@@ -877,6 +881,7 @@
         umath_src = [join('src', 'umath', 'umathmodule_onefile.c')]
         umath_src.append(generate_umath_templated_sources)
         umath_src.append(join('src', 'umath', 'funcs.inc.src'))
+        umath_src.append(join('src', 'umath', 'simd.inc.src'))
 
     config.add_extension('umath',
                          sources = umath_src +
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 687c62987..c0287b8c8 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -19,11 +19,15 @@
 #include "npy_pycompat.h"
 
 #include "ufunc_object.h"
 
-#include <assert.h>
-#ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
-#endif
+
+/*
+ * include vectorized functions and dispatchers
+ * this file is safe to include also for generic builds
+ * platform specific instructions are either masked via the proprocessor or
+ * runtime detected
+ */
+#include "simd.inc"
 
 /*
@@ -37,15 +41,6 @@
     && (steps[0] == steps[2])\
     && (steps[0] == 0))
 
-/*
- * stride is equal to element size and input and destination are equal or
- * don't overlap within one register
- */
-#define IS_BLOCKABLE_UNARY(esize, vsize) \
-    (steps[0] == (esize) && steps[0] == steps[1] && \
-     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
-     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
-
 #define OUTPUT_LOOP\
     char *op1 = args[1];\
     npy_intp os1 = steps[1];\
@@ -67,22 +62,6 @@
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
 
-/* align output to alignment
- * store alignment is usually more important than load alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
-    type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
-    npy_intp n = dimensions[0];\
-    npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
-                                                alignment, n);\
-    for(i = 0; i < peel; i++)
-
-#define UNARY_LOOP_BLOCKED(type, vsize)\
-    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
-            i += (vsize / sizeof(type)))
-
-#define UNARY_LOOP_BLOCKED_END\
-    for (; i < n; i++)
-
 #define BINARY_LOOP\
     char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
@@ -1297,48 +1276,25 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  *****************************************************************************
  */
 
-
 /**begin repeat
+ * Float types
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
- * #vtype = __m128, __m128d#
- * #vsuf = ps, pd#
 */
+
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-#ifdef HAVE_EMMINTRIN_H
-    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
-        UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
-            op1[i] = @scalarf@(ip1[i]);
-        }
-        assert(npy_is_aligned(&op1[i], 16));
-        if (npy_is_aligned(&ip1[i], 16)) {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        else {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        UNARY_LOOP_BLOCKED_END {
-            op1[i] = @scalarf@(ip1[i]);
-        }
+    if (run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
+        return;
     }
-    else
-#endif
-    {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = @scalarf@(in1);
     }
 }
+
 /**end repeat**/
@@ -1567,6 +1523,9 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
+        return;
+    }
     UNARY_LOOP {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ tmp = in1 > 0 ? in1 : -in1;
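After these hunks, `@TYPE@_sqrt` and `@TYPE@_absolute` in loops.c.src keep only the generic `UNARY_LOOP` and hand the fast path to the `run_unary_simd_*` dispatchers provided by simd.inc. The standalone sketch below shows the same dispatch-or-fall-back shape for a single concrete type; it is not the NumPy code itself: `run_simd_sqrt_float` and `sqrt_loop_float` are invented names, the guard uses the compiler-defined `__SSE2__` macro instead of the configure-time `HAVE_EMMINTRIN_H` from the diff, and it only accepts 16-byte aligned buffers where the real `IS_BLOCKABLE_UNARY` also checks strides and overlap.

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#if defined(__SSE2__)
#include <emmintrin.h>
#endif

/* Returns 1 if the vectorized path handled the data, 0 otherwise. */
static int
run_simd_sqrt_float(float *out, const float *in, size_t n)
{
#if defined(__SSE2__)
    if (((uintptr_t)out % 16 == 0) && ((uintptr_t)in % 16 == 0)) {
        size_t i;
        for (i = 0; i + 4 <= n; i += 4) {
            _mm_store_ps(&out[i], _mm_sqrt_ps(_mm_load_ps(&in[i])));
        }
        for (; i < n; i++) {            /* scalar tail */
            out[i] = sqrtf(in[i]);
        }
        return 1;
    }
#endif
    (void)out; (void)in; (void)n;
    return 0;
}

static void
sqrt_loop_float(float *out, const float *in, size_t n)
{
    size_t i;
    if (run_simd_sqrt_float(out, in, n)) {
        return;                         /* vectorized path already ran */
    }
    for (i = 0; i < n; i++) {           /* generic scalar fallback */
        out[i] = sqrtf(in[i]);
    }
}
```

Keeping the dispatcher in a separate include means loops.c.src itself stays free of intrinsics, matching the new comment there about the file being safe for generic builds.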
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
new file mode 100644
index 000000000..916473a0b
--- /dev/null
+++ b/numpy/core/src/umath/simd.inc.src
@@ -0,0 +1,173 @@
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of simd vectorized operations.
+ *
+ * Currently contains sse2 functions that are built on amd64, x32 or
+ * non-generic builds (CFLAGS=-march=...)
+ * In future it may contain other instruction sets like AVX or NEON detected
+ * at runtime in which case it needs to be included indirectly via a file
+ * compiled with special options (or use gcc target attributes) so the binary
+ * stays portable.
+ */
+
+
+#ifndef __NPY_SIMD_INC
+#define __NPY_SIMD_INC
+
+#include "lowlevel_strided_loops.h"
+#include "npy_config.h"
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+ * stride is equal to element size and input and destination are equal or
+ * don't overlap within one register
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+    (steps[0] == (esize) && steps[0] == steps[1] && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
+
+
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+                                                alignment, n);\
+    for(i = 0; i < peel; i++)
+
+#define UNARY_LOOP_BLOCKED(type, vsize)\
+    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+            i += (vsize / sizeof(type)))
+
+#define UNARY_LOOP_BLOCKED_END\
+    for (; i < n; i++)
+
+
+/*
+ * Dispatcher functions
+ * decide whether the operation can be vectorized and run it
+ * if it was run returns true and false if nothing was done
+ */
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #vector = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute#
+ */
+
+#if @vector@
+
+/* prototypes */
+static void
+sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined HAVE_EMMINTRIN_H
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/*
+ * Vectorized operations
+ */
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
+ * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
+ * #vsuf = ps, pd#
+ */
+
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+
+static void
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        op[i] = @scalarf@(ip[i]);
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        op[i] = @scalarf@(ip[i]);
+    }
+}
+
+
+static void
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+}
+
+#endif
+
+
+/**end repeat**/
+
+#endif
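The two kernels in the new file share one structure: a scalar peel loop that runs until the output pointer is 16-byte aligned, a blocked loop of full SSE2 vectors, and a scalar tail for the remainder; `absolute` additionally builds its mask from `-0.@c@` so that `andnot` clears only the sign bit. The sketch below reproduces that structure for plain `float` without the `@type@`/`@vsuf@` template machinery; `sse2_absolute_float_sketch` is an invented name, and the real kernels are only reached after the dispatcher has checked strides and overlap.

```c
#include <emmintrin.h>   /* SSE2; needs -msse2 on x86_32 builds */
#include <stddef.h>
#include <stdint.h>

/* Simplified stand-in for the sse2_absolute_FLOAT kernel in simd.inc.src. */
static void
sse2_absolute_float_sketch(float *op, const float *ip, size_t n)
{
    /* -0.0f has only the sign bit set; andnot(mask, x) == (~mask) & x */
    const __m128 mask = _mm_set1_ps(-0.0f);
    size_t i = 0;

    /* peel: scalar elements until the output pointer is 16-byte aligned */
    while (i < n && ((uintptr_t)&op[i] % 16) != 0) {
        const float tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;    /* + 0 turns a -0.0 intermediate into +0.0 */
        i++;
    }
    /* blocked: four floats per iteration, aligned stores */
    for (; i + 4 <= n; i += 4) {
        __m128 a = _mm_loadu_ps(&ip[i]);             /* input may be unaligned */
        _mm_store_ps(&op[i], _mm_andnot_ps(mask, a));
    }
    /* tail: whatever does not fill a full vector */
    for (; i < n; i++) {
        const float tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;
    }
}
```

As the removed loops.c.src macro comment noted, store alignment is usually more important than load alignment, which is why this sketch, like the unaligned-input branch in the diff, pairs an unaligned load with an aligned store; the `+ 0` in the scalar branches normalizes a `-0.0` result to `+0.0`.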
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
             np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 7a3ea7a1c..2d8d57c5f 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1580,13 +1580,13 @@ def gen_alignment_data(dtype=float32, type='binary', max_size=24):
                     (o, o, o, s, dtype, 'in place2')
             yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'out of place')
-            yield out[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+            yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'out of place')
-            yield out[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+            yield out[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'out of place')
             yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'aliased')
-            yield inp1()[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+            yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'aliased')
-            yield inp1()[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+            yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'aliased')