path: root/numpy/core/src
author    Charles Harris <charlesr.harris@gmail.com>  2013-07-08 15:26:54 -0700
committer Charles Harris <charlesr.harris@gmail.com>  2013-07-08 15:26:54 -0700
commit    f1c776657467391781767780bb0a783d24bb8d50 (patch)
tree      56dccd21a5ab8e433fc251a82c844ba45b0abc3b /numpy/core/src
parent    cfe411b6cccb177003c99fb780917f97f4be38e9 (diff)
parent    01a9081e7791f19d65f73e623a5dfeec52243be3 (diff)
download  numpy-f1c776657467391781767780bb0a783d24bb8d50.tar.gz
Merge pull request #3507 from juliantaylor/vectorize-cmp
ENH: vectorize boolean comparisons of floats
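
The idea behind the patch, in brief: an SSE2 packed compare leaves each lane all-ones or all-zeros, _mm_movemask_ps collapses those lanes into one bit per element, and a small lookup table fans the bits back out into one boolean byte per element, which is then written with a single memcpy. A minimal standalone sketch of that sequence (assuming a little-endian x86 target and a local table mirroring the patch's fanout_4; this is an illustration, not the generated ufunc code):

#include <emmintrin.h>
#include <string.h>

/* lookup: bit i of the movemask result becomes byte i of the output (little endian) */
static const int bits_to_bytes[16] = {
    0x00000000, 0x00000001, 0x00000100, 0x00000101,
    0x00010000, 0x00010001, 0x00010100, 0x00010101,
    0x01000000, 0x01000001, 0x01000100, 0x01000101,
    0x01010000, 0x01010001, 0x01010100, 0x01010101,
};

/* compare four floats at once and store four boolean bytes */
static void
less_4_floats(unsigned char *op, const float *ip1, const float *ip2)
{
    __m128 a = _mm_loadu_ps(ip1);
    __m128 b = _mm_loadu_ps(ip2);
    int mask = _mm_movemask_ps(_mm_cmplt_ps(a, b));  /* 4 result bits */
    int ir = bits_to_bytes[mask];                    /* 4 result bytes */
    memcpy(op, &ir, sizeof(ir));                     /* op may be unaligned */
}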
Diffstat (limited to 'numpy/core/src')
-rw-r--r--  numpy/core/src/umath/loops.c.src  |   3
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 183
2 files changed, 185 insertions(+), 1 deletion(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index a99bdb9d2..0559fb416 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1351,6 +1351,9 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
+ if (run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+ return;
+ }
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
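
For each float/double comparison ufunc, the templated loop above now tries the SSE2 dispatcher first and only falls back to the scalar BINARY_LOOP when the dispatcher declines (strided input, misaligned data, or no SSE2 support). A hedged sketch of roughly what the expansion looks like for one instance, say FLOAT/greater (simplified, not the literal generated code):

NPY_NO_EXPORT void
FLOAT_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    if (run_binary_simd_greater_FLOAT(args, dimensions, steps)) {
        return;  /* the SSE2 kernel handled the whole array */
    }
    BINARY_LOOP {
        const npy_float in1 = *(npy_float *)ip1;
        const npy_float in2 = *(npy_float *)ip2;
        *((npy_bool *)op1) = in1 > in2;
    }
}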
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index dc3b6ad8e..98e2beb30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -24,6 +24,7 @@
#endif
#include <assert.h>
#include <stdlib.h>
+#include <string.h> /* for memcpy */
int PyUFunc_getfperr(void);
void PyUFunc_clearfperr(void);
@@ -59,6 +60,19 @@ void PyUFunc_clearfperr(void);
((abs(args[2] - args[0]) >= (vsize)) || (abs(args[2] - args[0]) == 0)) && \
abs(args[2] - args[1]) >= (esize))
+#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)) && \
+ npy_is_aligned(args[0], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
+ (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
+ npy_is_aligned(args[1], (esize)))
+
+#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
+ (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
+ npy_is_aligned(args[0], (esize)))
+
/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
@@ -72,6 +86,35 @@ void PyUFunc_clearfperr(void);
#define LOOP_BLOCKED_END\
for (; i < n; i++)
+/* fanout two bits to two bytes */
+static const npy_int16 fanout_2[] = {
+ 0x0000,
+ 0x0001,
+ 0x0100,
+ 0x0101,
+};
+
+/* fanout four bits to four bytes */
+static const npy_int32 fanout_4[] = {
+ 0x00000000,
+ 0x00000001,
+ 0x00000100,
+ 0x00000101,
+ 0x00010000,
+ 0x00010001,
+ 0x00010100,
+ 0x00010101,
+ 0x01000000,
+ 0x01000001,
+ 0x01000100,
+ 0x01000101,
+ 0x01010000,
+ 0x01010001,
+ 0x01010100,
+ 0x01010101
+};
+
+
/*
* Dispatcher functions
* decide whether the operation can be vectorized and run it
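
The fanout tables added above exist because _mm_movemask_ps/_mm_movemask_pd return one bit per lane, while the ufunc has to store one npy_bool byte per lane. Indexing the table with the mask yields an integer whose i-th byte equals the i-th bit, so all output bytes for a vector can be written with one small memcpy. A worked example (assuming a little-endian x86 target, which SSE2 implies):

int mask = 0xa;                  /* 0b1010: lanes 1 and 3 compared true */
npy_int32 ir = fanout_4[mask];   /* 0x01000100 */
npy_bool out[4];
memcpy(out, &ir, sizeof(ir));    /* out = {0, 1, 0, 1} on little endian */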
@@ -122,7 +165,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
*/
#if @vector@ && defined HAVE_EMMINTRIN_H
@@ -168,6 +210,55 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
/**end repeat1**/
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
+ * logical_and, logical_or#
+ * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
+ */
+
+#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
+
+/* prototypes */
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ npy_bool * op = (npy_bool *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), 16)) {
+ sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), 16)) {
+ sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), 16)) {
+ sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
/**end repeat**/
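
The SSE2 kernels dispatched above all follow the same three-phase shape built from LOOP_BLOCK_ALIGN_VAR, LOOP_BLOCKED and LOOP_BLOCKED_END: a scalar peel until the chosen input pointer reaches 16-byte alignment, an aligned vector body, and a scalar tail for the leftover elements. A self-contained sketch of that shape for float 'less' (a hypothetical helper with the macros expanded by hand; the real peel/tail go through a scalar SSE compare instead of the C operator, see the note at the end of this diff):

static void
sketch_less_FLOAT(npy_bool *op, const npy_float *ip1, const npy_float *ip2, npy_intp n)
{
    npy_intp i;
    /* peel: scalar elements until ip1 is 16-byte aligned
     * (the dispatcher already guarantees ip1 is element-aligned) */
    npy_intp peel = (npy_intp)(((16 - ((npy_uintp)ip1 % 16)) % 16) / sizeof(npy_float));
    if (peel > n) {
        peel = n;
    }
    for (i = 0; i < peel; i++) {
        op[i] = ip1[i] < ip2[i];
    }
    /* blocked: aligned SSE2 body, four floats per iteration */
    for (; i + 4 <= n; i += 4) {
        __m128 a = _mm_load_ps(&ip1[i]);   /* aligned thanks to the peel */
        __m128 b = _mm_loadu_ps(&ip2[i]);  /* second operand may be unaligned */
        npy_int32 ir = fanout_4[_mm_movemask_ps(_mm_cmplt_ps(a, b))];
        memcpy(&op[i], &ir, sizeof(ir));   /* output may be unaligned */
    }
    /* tail: whatever is left after the last full vector */
    for (; i < n; i++) {
        op[i] = ip1[i] < ip2[i];
    }
}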
/*
@@ -281,7 +372,10 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
* #vtype = __m128, __m128d#
* #vpre = _mm, _mm#
* #vsuf = ps, pd#
+ * #vsufs = ss, sd#
* #nan = NPY_NANF, NPY_NAN#
+ * #mtype = npy_int32, npy_int16#
+ * #fanout = fanout_4, fanout_2#
*/
@@ -407,6 +501,93 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
/**end repeat1**/
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+*/
+
+/* sets invalid fpu flag on QNaN for consistency with packed compare */
+static NPY_INLINE int
+sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
+{
+ @type@ tmp;
+ @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
+ @vpre@_load_@vsufs@(&b));
+ @vpre@_store_@vsufs@(&tmp, v);
+ return sizeof(@type@) == 4 ?
+ (*(npy_uint32 *)&tmp) & 1 : (*(npy_uint64 *)&tmp) & 1;
+}
+
+static void
+sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ npy_bool * r;
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+ }
+ r = &op[i];
+ LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+ /* may be unaligned */
+ memcpy(r, &ir, sizeof(ir));
+ r += sizeof(ir);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
+ }
+}
+
+
+static void
+sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ npy_bool * r;
+ @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+ }
+ r = &op[i];
+ LOOP_BLOCKED(@type@, 16) {
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+ /* may be unaligned */
+ memcpy(r, &ir, sizeof(ir));
+ r += sizeof(ir);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
+ }
+}
+
+
+static void
+sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ npy_bool * r;
+ @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+ }
+ r = &op[i];
+ LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
+ @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
+ /* may be unaligned */
+ memcpy(r, &ir, sizeof(ir));
+ r += sizeof(ir);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
+ }
+}
+/**end repeat1**/
+
static void
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
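
One detail worth calling out: the scalar helper sse2_ordered_cmp_@kind@_@TYPE@ used in the peel and tail deliberately goes through the scalar SSE compare instructions rather than the plain C operators, so that comparing against a quiet NaN raises the invalid FP flag exactly as the packed compares in the vector body do; otherwise the warnings a user sees would depend on where a NaN happens to fall relative to the 16-byte blocks. For FLOAT and 'less' the template expands to roughly this (hedged sketch):

static NPY_INLINE int
sse2_ordered_cmp_less_FLOAT(const npy_float a, const npy_float b)
{
    npy_float tmp;
    /* CMPLTSS is a signalling predicate: it sets the invalid flag on QNaN */
    __m128 v = _mm_cmplt_ss(_mm_load_ss(&a), _mm_load_ss(&b));
    _mm_store_ss(&tmp, v);
    return (*(npy_uint32 *)&tmp) & 1;  /* all-ones or all-zeros lane -> 1 or 0 */
}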