author     Charles Harris <charlesr.harris@gmail.com>   2013-07-08 15:26:54 -0700
committer  Charles Harris <charlesr.harris@gmail.com>   2013-07-08 15:26:54 -0700
commit     f1c776657467391781767780bb0a783d24bb8d50 (patch)
tree       56dccd21a5ab8e433fc251a82c844ba45b0abc3b /numpy/core/src
parent     cfe411b6cccb177003c99fb780917f97f4be38e9 (diff)
parent     01a9081e7791f19d65f73e623a5dfeec52243be3 (diff)
download   numpy-f1c776657467391781767780bb0a783d24bb8d50.tar.gz
Merge pull request #3507 from juliantaylor/vectorize-cmp
ENH: vectorize boolean comparisons of floats
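The core of the change in simd.inc.src below is how a packed SSE2 comparison, which yields an all-ones or all-zeros mask per lane, becomes the one-byte-per-element npy_bool output: _mm_movemask_ps/_mm_movemask_pd compress the mask to 4 or 2 bits, the new fanout_4/fanout_2 lookup tables expand those bits back into 0x00/0x01 bytes, and a memcpy writes them out because the output pointer may not be aligned for a direct integer store. Below is a minimal, de-templated sketch of that idea for the single-precision "less than" case, assuming SSE2 and a little-endian target (matching how the patch's tables are laid out); the names fanout4 and cmp4_lt are illustrative, not part of the patch.

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* 4-bit movemask value -> four 0x00/0x01 bytes (little-endian layout) */
static const uint32_t fanout4[16] = {
    0x00000000, 0x00000001, 0x00000100, 0x00000101,
    0x00010000, 0x00010001, 0x00010100, 0x00010101,
    0x01000000, 0x01000001, 0x01000100, 0x01000101,
    0x01010000, 0x01010001, 0x01010100, 0x01010101
};

/* out[0..3] = (a[i] < b[i]) as one byte each */
static void cmp4_lt(unsigned char *out, const float *a, const float *b)
{
    __m128 va = _mm_loadu_ps(a);
    __m128 vb = _mm_loadu_ps(b);
    __m128 vc = _mm_cmplt_ps(va, vb);           /* all-ones per true lane */
    uint32_t r = fanout4[_mm_movemask_ps(vc)];  /* 4 bits -> 4 bytes */
    memcpy(out, &r, sizeof(r));                 /* out may be unaligned */
}

The actual kernels additionally peel elements until the first input reaches a 16-byte boundary and handle the remainder element by element, which is what the LOOP_BLOCK_ALIGN_VAR / LOOP_BLOCKED / LOOP_BLOCKED_END macros in the diff take care of.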
Diffstat (limited to 'numpy/core/src')
-rw-r--r--  numpy/core/src/umath/loops.c.src  |   3
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 183
2 files changed, 185 insertions, 1 deletion
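The SSE2 kernels are only entered when the operand layout permits 16-byte blocking; otherwise run_binary_simd_* returns 0 and the generic BINARY_LOOP in loops.c.src runs exactly as before. A rough, de-templated sketch of what the new IS_BLOCKABLE_BINARY_BOOL check amounts to for contiguous float inputs follows; blockable_binary_bool_f32 and is_aligned are illustrative helper names, and ptrdiff_t stands in for npy_intp only to keep the sketch self-contained.

#include <stddef.h>
#include <stdint.h>

/* true when p sits on an n-byte boundary */
static int is_aligned(const void *p, size_t n)
{
    return ((uintptr_t)p % n) == 0;
}

/*
 * Both inputs advance by one float per element, the npy_bool output by one
 * byte, and the inputs are element-aligned so the peel loop can reach a
 * 16-byte boundary.
 */
static int blockable_binary_bool_f32(char **args, const ptrdiff_t *steps)
{
    return steps[0] == (ptrdiff_t)sizeof(float) &&
           steps[1] == (ptrdiff_t)sizeof(float) &&
           steps[2] == 1 &&
           is_aligned(args[0], sizeof(float)) &&
           is_aligned(args[1], sizeof(float));
}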
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index a99bdb9d2..0559fb416 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1351,6 +1351,9 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+        return;
+    }
     BINARY_LOOP {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ in2 = *(@type@ *)ip2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index dc3b6ad8e..98e2beb30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -24,6 +24,7 @@
 #endif
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h> /* for memcpy */
 
 int PyUFunc_getfperr(void);
 void PyUFunc_clearfperr(void);
@@ -59,6 +60,19 @@ void PyUFunc_clearfperr(void);
     ((abs(args[2] - args[0]) >= (vsize)) || (abs(args[2] - args[0]) == 0)) && \
     abs(args[2] - args[1]) >= (esize))
 
+#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
+    (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
+     npy_is_aligned(args[1], (esize)) && \
+     npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
+    (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
+     npy_is_aligned(args[1], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
+    (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
+     npy_is_aligned(args[0], (esize)))
+
 /* align var to alignment */
 #define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
     npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
@@ -72,6 +86,35 @@ void PyUFunc_clearfperr(void);
 #define LOOP_BLOCKED_END\
     for (; i < n; i++)
 
+/* fanout two bits to two bytes */
+static const npy_int16 fanout_2[] = {
+    0x0000,
+    0x0001,
+    0x0100,
+    0x0101,
+};
+
+/* fanout four bits to four bytes */
+static const npy_int32 fanout_4[] = {
+    0x00000000,
+    0x00000001,
+    0x00000100,
+    0x00000101,
+    0x00010000,
+    0x00010001,
+    0x00010100,
+    0x00010101,
+    0x01000000,
+    0x01000001,
+    0x01000100,
+    0x01000101,
+    0x01010000,
+    0x01010001,
+    0x01010100,
+    0x01010101
+};
+
+
 /*
  * Dispatcher functions
  * decide whether the operation can be vectorized and run it
@@ -122,7 +165,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
 /**begin repeat1
  * Arithmetic
  * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
  */
 
 #if @vector@ && defined HAVE_EMMINTRIN_H
@@ -168,6 +210,55 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
 
 /**end repeat1**/
 
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
+ *         logical_and, logical_or#
+ * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
+ */
+
+#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
+
+/* prototypes */
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+                          npy_intp n);
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+                                  npy_intp n);
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+                                  npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
+    @type@ * ip1 = (@type@ *)args[0];
+    @type@ * ip2 = (@type@ *)args[1];
+    npy_bool * op = (npy_bool *)args[2];
+    npy_intp n = dimensions[0];
+    /* argument one scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), 16)) {
+        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+        return 1;
+    }
+    /* argument two scalar */
+    else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), 16)) {
+        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+        return 1;
+    }
+    else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), 16)) {
+        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+
 /**end repeat**/
 
 /*
@@ -281,7 +372,10 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
 * #vtype = __m128, __m128d#
 * #vpre = _mm, _mm#
 * #vsuf = ps, pd#
+ * #vsufs = ss, sd#
 * #nan = NPY_NANF, NPY_NAN#
+ * #mtype = npy_int32, npy_int16#
+ * #fanout = fanout_4, fanout_2#
 */
@@ -407,6 +501,93 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 
 /**end repeat1**/
 
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+*/
+
+/* sets invalid fpu flag on QNaN for consistency with packed compare */
+static NPY_INLINE int
+sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
+{
+    @type@ tmp;
+    @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
+                                     @vpre@_load_@vsufs@(&b));
+    @vpre@_store_@vsufs@(&tmp, v);
+    return sizeof(@type@) == 4 ?
+        (*(npy_uint32 *)&tmp) & 1 : (*(npy_uint64 *)&tmp) & 1;
+}
+
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+    npy_bool * r;
+    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+    }
+    r = &op[i];
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+        @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+        /* may be unaligned */
+        memcpy(r, &ir, sizeof(ir));
+        r += sizeof(ir);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+    }
+}
+
+
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+    npy_bool * r;
+    @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
+    LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+    }
+    r = &op[i];
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+        /* may be unaligned */
+        memcpy(r, &ir, sizeof(ir));
+        r += sizeof(ir);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+    }
+}
+
+
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+    npy_bool * r;
+    @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
+    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+    }
+    r = &op[i];
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+        /* may be unaligned */
+        memcpy(r, &ir, sizeof(ir));
+        r += sizeof(ir);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+    }
+}
+/**end repeat1**/
+
 static void
 sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
 {
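A subtlety in the diff above: the peel and tail iterations do not use the plain C comparison operator. The sse2_ordered_cmp_* helper routes even single elements through an SSE scalar compare so that a quiet NaN raises the invalid FPU flag just as the packed compare in the blocked loop does, keeping floating-point warnings consistent across both paths. A minimal single-precision sketch of that helper for the "less than" case; the name scalar_ordered_lt is illustrative, not from the patch.

#include <emmintrin.h>

/*
 * Scalar "a < b" that, like _mm_cmplt_ps, raises the invalid FPE flag on a
 * quiet NaN operand. The compare result is all-ones when true and all-zeros
 * when false, so bit 0 of the stored word is the boolean answer.
 */
static int scalar_ordered_lt(float a, float b)
{
    union { float f; unsigned int u; } tmp;
    __m128 v = _mm_cmplt_ss(_mm_load_ss(&a), _mm_load_ss(&b));
    _mm_store_ss(&tmp.f, v);
    return tmp.u & 1;
}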