| field | value | date |
|---|---|---|
| author | Charles Harris <charlesr.harris@gmail.com> | 2013-06-08 14:57:15 -0700 |
| committer | Charles Harris <charlesr.harris@gmail.com> | 2013-06-08 14:57:15 -0700 |
| commit | 3e471304393244eb02215d1601d2396b3f94eb8d (patch) | |
| tree | c837f5bd2272256f361cb2ec6b0d04a84b023a8d | |
| parent | 23a27572994e1c731605c6a78a1564014c2a62c8 (diff) | |
| parent | 7fb8b714906a92516905cc0f03e45511bd1ac1ed (diff) | |
| download | numpy-3e471304393244eb02215d1601d2396b3f94eb8d.tar.gz | |
Merge pull request #3411 from juliantaylor/vectorize-fabs
ENH: Vectorize float absolute operation with sse2
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | doc/release/1.8.0-notes.rst | 18 |
| -rw-r--r-- | numpy/core/bscript | 6 |
| -rw-r--r-- | numpy/core/setup.py | 7 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 79 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 173 |
| -rw-r--r-- | numpy/core/tests/test_umath.py | 26 |
| -rw-r--r-- | numpy/testing/utils.py | 8 |
7 files changed, 250 insertions, 67 deletions
diff --git a/doc/release/1.8.0-notes.rst b/doc/release/1.8.0-notes.rst
index 76dcf50c2..c5315e6cd 100644
--- a/doc/release/1.8.0-notes.rst
+++ b/doc/release/1.8.0-notes.rst
@@ -142,6 +142,24 @@ The `pad` function has a new implementation, greatly improving performance for
 all inputs except `mode=<function>` (retained for backwards compatibility).
 Scaling with dimensionality is dramatically improved for rank >= 4.
 
+Performance improvements to `isnan`, `isinf`, `isfinite` and `byteswap`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`isnan`, `isinf`, `isfinite` and `byteswap` have been improved to take
+advantage of compiler builtins to avoid expensive calls to libc.
+This improves performance of these operations by about a factor of two on gnu
+libc systems.
+
+Performance improvements to `sqrt` and `abs`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The `sqrt` and `abs` functions for unit stride elementary operations have been
+improved to make use of SSE2 CPU SIMD instructions.
+This improves performance of these operations up to 4x/2x for float32/float64
+depending on the location of the data in the CPU caches. The performance gain
+is greatest for in-place operations.
+In order to use the improved functions the SSE2 instruction set must be enabled
+at compile time. It is enabled by default on x86_64 systems. On x86_32 with a
+capable CPU it must be enabled by passing the appropriate flag to CFLAGS build
+variable (-msse2 with gcc).
 
 Changes
 =======
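The first note above attributes the `isnan`/`isinf`/`isfinite`/`byteswap` speedup to compiler builtins replacing libc calls. As a rough, hypothetical sketch of that idea only (this is not code touched by this commit, and `my_isnan` is an invented name, not NumPy's `npy_isnan`):

```c
#include <math.h>

/*
 * Hypothetical illustration: use the gcc/clang builtin when it is
 * available so the check can be inlined, otherwise fall back to the
 * standard C99 isnan() macro from <math.h>.
 */
static inline int
my_isnan(double x)
{
#if defined(__GNUC__)
    return __builtin_isnan(x);
#else
    return isnan(x) != 0;
#endif
}
```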
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 44f7f14f0..ee5543354 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -492,8 +492,10 @@ def pre_build(context):
             pattern="ufunc_api",
             name="ufunc_api")
 
-    ufunc_templates = ["src/umath/loops.c.src",
-                       "src/umath/funcs.inc.src"]
+    ufunc_templates = [
+            "src/umath/loops.c.src",
+            "src/umath/funcs.inc.src",
+            "src/umath/simd.inc.src"]
     bld(target="ufunc_templates", source=ufunc_templates)
 
     bld(features="umath_gen",
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a6ddfb843..926142b55 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -837,7 +837,9 @@ def configuration(parent_package='',top_path=None):
         subpath = join('src', 'umath')
         # NOTE: For manual template conversion of loops.h.src, read the note
         #       in that file.
-        sources = [join(local_dir, subpath, 'loops.c.src')]
+        sources = [
+            join(local_dir, subpath, 'loops.c.src'),
+            join(local_dir, subpath, 'simd.inc.src')]
 
         # numpy.distutils generate .c from .c.src in weird directories, we have
         # to add them there as they depend on the build_dir
@@ -864,12 +866,14 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'umathmodule.c'),
             join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
+            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
             join('src', 'umath', 'ufunc_type_resolution.c')]
 
     umath_deps = [
             generate_umath_py,
+            join('src', 'umath', 'simd.inc.src'),
             join(codegen_dir,'generate_ufunc_api.py')]
 
     if not ENABLE_SEPARATE_COMPILATION:
@@ -877,6 +881,7 @@
         umath_src = [join('src', 'umath', 'umathmodule_onefile.c')]
         umath_src.append(generate_umath_templated_sources)
         umath_src.append(join('src', 'umath', 'funcs.inc.src'))
+        umath_src.append(join('src', 'umath', 'simd.inc.src'))
 
     config.add_extension('umath',
                          sources = umath_src +
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 687c62987..c0287b8c8 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -19,11 +19,15 @@
 #include "npy_pycompat.h"
 
 #include "ufunc_object.h"
 
-#include <assert.h>
-#ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
-#endif
+
+/*
+ * include vectorized functions and dispatchers
+ * this file is safe to include also for generic builds
+ * platform specific instructions are either masked via the proprocessor or
+ * runtime detected
+ */
+#include "simd.inc"
 
 /*
@@ -37,15 +41,6 @@
     && (steps[0] == steps[2])\
     && (steps[0] == 0))
 
-/*
- * stride is equal to element size and input and destination are equal or
- * don't overlap within one register
- */
-#define IS_BLOCKABLE_UNARY(esize, vsize) \
-    (steps[0] == (esize) && steps[0] == steps[1] && \
-     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
-     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
-
 #define OUTPUT_LOOP\
     char *op1 = args[1];\
     npy_intp os1 = steps[1];\
@@ -67,22 +62,6 @@
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
 
-/* align output to alignment
- * store alignment is usually more important than load alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
-    type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
-    npy_intp n = dimensions[0];\
-    npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
-                                                alignment, n);\
-    for(i = 0; i < peel; i++)
-
-#define UNARY_LOOP_BLOCKED(type, vsize)\
-    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
-            i += (vsize / sizeof(type)))
-
-#define UNARY_LOOP_BLOCKED_END\
-    for (; i < n; i++)
-
 #define BINARY_LOOP\
     char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
@@ -1297,48 +1276,25 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  *****************************************************************************
  */
 
-
 /**begin repeat
+ * Float types
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
- * #vtype = __m128, __m128d#
- * #vsuf = ps, pd#
 */
+
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-#ifdef HAVE_EMMINTRIN_H
-    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
-        UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
-            op1[i] = @scalarf@(ip1[i]);
-        }
-        assert(npy_is_aligned(&op1[i], 16));
-        if (npy_is_aligned(&ip1[i], 16)) {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        else {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        UNARY_LOOP_BLOCKED_END {
-            op1[i] = @scalarf@(ip1[i]);
-        }
+    if (run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
+        return;
     }
-    else
-#endif
-    {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = @scalarf@(in1);
     }
 }
+
 /**end repeat**/
@@ -1567,6 +1523,9 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
+        return;
+    }
     UNARY_LOOP {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ tmp = in1 > 0 ? in1 : -in1;
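After these hunks, `@TYPE@_sqrt` and `@TYPE@_absolute` in loops.c.src keep only the generic `UNARY_LOOP` and hand the fast path to the `run_unary_simd_*` dispatchers provided by simd.inc. The standalone sketch below shows the same dispatch-or-fall-back shape for a single concrete type; it is not the NumPy code itself: `run_simd_sqrt_float` and `sqrt_loop_float` are invented names, the guard uses the compiler-defined `__SSE2__` macro instead of the configure-time `HAVE_EMMINTRIN_H` from the diff, and it only accepts 16-byte aligned buffers where the real `IS_BLOCKABLE_UNARY` also checks strides and overlap.

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#if defined(__SSE2__)
#include <emmintrin.h>
#endif

/* Returns 1 if the vectorized path handled the data, 0 otherwise. */
static int
run_simd_sqrt_float(float *out, const float *in, size_t n)
{
#if defined(__SSE2__)
    if (((uintptr_t)out % 16 == 0) && ((uintptr_t)in % 16 == 0)) {
        size_t i;
        for (i = 0; i + 4 <= n; i += 4) {
            _mm_store_ps(&out[i], _mm_sqrt_ps(_mm_load_ps(&in[i])));
        }
        for (; i < n; i++) {            /* scalar tail */
            out[i] = sqrtf(in[i]);
        }
        return 1;
    }
#endif
    (void)out; (void)in; (void)n;
    return 0;
}

static void
sqrt_loop_float(float *out, const float *in, size_t n)
{
    size_t i;
    if (run_simd_sqrt_float(out, in, n)) {
        return;                         /* vectorized path already ran */
    }
    for (i = 0; i < n; i++) {           /* generic scalar fallback */
        out[i] = sqrtf(in[i]);
    }
}
```

Keeping the dispatcher in a separate include means loops.c.src itself stays free of intrinsics, matching the new comment there about the file being safe for generic builds.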
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
new file mode 100644
index 000000000..916473a0b
--- /dev/null
+++ b/numpy/core/src/umath/simd.inc.src
@@ -0,0 +1,173 @@
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of simd vectorized operations.
+ *
+ * Currently contains sse2 functions that are built on amd64, x32 or
+ * non-generic builds (CFLAGS=-march=...)
+ * In future it may contain other instruction sets like AVX or NEON detected
+ * at runtime in which case it needs to be included indirectly via a file
+ * compiled with special options (or use gcc target attributes) so the binary
+ * stays portable.
+ */
+
+
+#ifndef __NPY_SIMD_INC
+#define __NPY_SIMD_INC
+
+#include "lowlevel_strided_loops.h"
+#include "npy_config.h"
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+ * stride is equal to element size and input and destination are equal or
+ * don't overlap within one register
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+    (steps[0] == (esize) && steps[0] == steps[1] && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
+
+
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+                                                alignment, n);\
+    for(i = 0; i < peel; i++)
+
+#define UNARY_LOOP_BLOCKED(type, vsize)\
+    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+            i += (vsize / sizeof(type)))
+
+#define UNARY_LOOP_BLOCKED_END\
+    for (; i < n; i++)
+
+
+/*
+ * Dispatcher functions
+ * decide whether the operation can be vectorized and run it
+ * if it was run returns true and false if nothing was done
+ */
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #vector = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute#
+ */
+
+#if @vector@
+
+/* prototypes */
+static void
+sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined HAVE_EMMINTRIN_H
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/*
+ * Vectorized operations
+ */
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
+ * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
+ * #vsuf = ps, pd#
+ */
+
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+
+static void
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        op[i] = @scalarf@(ip[i]);
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        op[i] = @scalarf@(ip[i]);
+    }
+}
+
+
+static void
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+}
+
+#endif
+
+
+/**end repeat**/
+
+#endif
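The two kernels in the new file share one structure: a scalar peel loop that runs until the output pointer is 16-byte aligned, a blocked loop of full SSE2 vectors, and a scalar tail for the remainder; `absolute` additionally builds its mask from `-0.@c@` so that `andnot` clears only the sign bit. The sketch below reproduces that structure for plain `float` without the `@type@`/`@vsuf@` template machinery; `sse2_absolute_float_sketch` is an invented name, and the real kernels are only reached after the dispatcher has checked strides and overlap.

```c
#include <emmintrin.h>   /* SSE2; needs -msse2 on x86_32 builds */
#include <stddef.h>
#include <stdint.h>

/* Simplified stand-in for the sse2_absolute_FLOAT kernel in simd.inc.src. */
static void
sse2_absolute_float_sketch(float *op, const float *ip, size_t n)
{
    /* -0.0f has only the sign bit set; andnot(mask, x) == (~mask) & x */
    const __m128 mask = _mm_set1_ps(-0.0f);
    size_t i = 0;

    /* peel: scalar elements until the output pointer is 16-byte aligned */
    while (i < n && ((uintptr_t)&op[i] % 16) != 0) {
        const float tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;    /* + 0 turns a -0.0 intermediate into +0.0 */
        i++;
    }
    /* blocked: four floats per iteration, aligned stores */
    for (; i + 4 <= n; i += 4) {
        __m128 a = _mm_loadu_ps(&ip[i]);             /* input may be unaligned */
        _mm_store_ps(&op[i], _mm_andnot_ps(mask, a));
    }
    /* tail: whatever does not fill a full vector */
    for (; i < n; i++) {
        const float tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;
    }
}
```

As the removed loops.c.src macro comment noted, store alignment is usually more important than load alignment, which is why this sketch, like the unaligned-input branch in the diff, pairs an unaligned load with an aligned store; the `+ 0` in the scalar branches normalizes a `-0.0` result to `+0.0`.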
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
             np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 7a3ea7a1c..2d8d57c5f 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1580,13 +1580,13 @@ def gen_alignment_data(dtype=float32, type='binary', max_size=24):
                     (o, o, o, s, dtype, 'in place2')
             yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'out of place')
-            yield out[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+            yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'out of place')
-            yield out[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+            yield out[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'out of place')
             yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'aliased')
-            yield inp1()[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+            yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'aliased')
-            yield inp1()[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+            yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'aliased')