author | Julian Taylor <jtaylor.debian@googlemail.com> | 2014-03-03 23:14:33 +0100 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2014-03-04 00:02:48 +0100 |
commit | 8fca578e615979def5333325068629922c0e415a (patch) | |
tree | c81b205a7ec6cd20c4696668c85cd66abc36cc37 | |
parent | d4c7c3a69a0dc2458c876dd17a15b1a18b179fd8 (diff) | |
download | numpy-8fca578e615979def5333325068629922c0e415a.tar.gz | |
ENH: improve vectorization of float comparisons
Unroll the loops and use pack instructions instead of the slowish
movemask with lookup table.
Doubles performance on AMD Phenoms and gives about a 30% gain on Intel
Haswells; Core2 also gains a few percent.
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 128 |
1 file changed, 68 insertions(+), 60 deletions(-)
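As background for the diff below: instead of calling `_mm_movemask_ps`/`_mm_movemask_pd` on every compare result and fanning the bit mask out to bool bytes through a lookup table, the loops are unrolled four-fold and the four vector masks are narrowed to bytes with pack instructions, so 16 output booleans are produced per iteration with a single store. A minimal standalone sketch of the single-precision case (not the templated NumPy source; the name `cmp16_lt_float`, the fixed less-than comparison, and `uint8_t` standing in for `npy_bool` are illustrative assumptions):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* Compare 16 floats of ip1 against 16 floats of ip2 and write 16 bool
 * bytes (0 or 1) to op, using pack instructions instead of movemask +
 * fanout lookup table. */
static void
cmp16_lt_float(uint8_t *op, const float *ip1, const float *ip2)
{
    const __m128i mask = _mm_set1_epi8(0x1);
    __m128 a1 = _mm_loadu_ps(ip1 + 0),  a2 = _mm_loadu_ps(ip2 + 0);
    __m128 b1 = _mm_loadu_ps(ip1 + 4),  b2 = _mm_loadu_ps(ip2 + 4);
    __m128 c1 = _mm_loadu_ps(ip1 + 8),  c2 = _mm_loadu_ps(ip2 + 8);
    __m128 d1 = _mm_loadu_ps(ip1 + 12), d2 = _mm_loadu_ps(ip2 + 12);
    /* each 32-bit lane of a compare result is all-ones or all-zeros */
    __m128i r1 = _mm_castps_si128(_mm_cmplt_ps(a1, a2));
    __m128i r2 = _mm_castps_si128(_mm_cmplt_ps(b1, b2));
    __m128i r3 = _mm_castps_si128(_mm_cmplt_ps(c1, c2));
    __m128i r4 = _mm_castps_si128(_mm_cmplt_ps(d1, d2));
    /* narrow 32-bit lanes to 16 bits, then to 8 bits; signed saturation
     * maps 0 -> 0x00 and -1 -> 0xff */
    __m128i ir1 = _mm_packs_epi32(r1, r2);
    __m128i ir2 = _mm_packs_epi32(r3, r4);
    __m128i rr  = _mm_packs_epi16(ir1, ir2);
    rr = _mm_and_si128(rr, mask);              /* 0xff -> 1 */
    _mm_storeu_si128((__m128i *)op, rr);       /* one 16-byte store */
}
```

The signed saturation in `_mm_packs_epi32`/`_mm_packs_epi16` preserves the all-zeros/all-ones compare lanes, and the final AND with `0x1` turns them into canonical 0/1 booleans.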
```diff
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 554fc199a..79f8c7a44 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -84,34 +84,6 @@
 #define LOOP_BLOCKED_END\
     for (; i < n; i++)
 
-/* fanout two bits to two bytes */
-static const npy_int16 fanout_2[] = {
-    0x0000,
-    0x0001,
-    0x0100,
-    0x0101,
-};
-
-/* fanout four bits to four bytes */
-static const npy_int32 fanout_4[] = {
-    0x00000000,
-    0x00000001,
-    0x00000100,
-    0x00000101,
-    0x00010000,
-    0x00010001,
-    0x00010100,
-    0x00010101,
-    0x01000000,
-    0x01000001,
-    0x01000100,
-    0x01000101,
-    0x01010000,
-    0x01010001,
-    0x01010100,
-    0x01010101
-};
-
 /*
  * Dispatcher functions
@@ -381,8 +353,8 @@ static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
  * #vsuf = ps, pd#
  * #vsufs = ss, sd#
  * #nan = NPY_NANF, NPY_NAN#
- * #mtype = npy_int32, npy_int16#
- * #fanout = fanout_4, fanout_2#
+ * #double = 0, 1#
+ * #cast = _mm_castps_si128, _mm_castpd_si128#
  */
@@ -529,19 +501,33 @@ sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
 static void
 sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
-    npy_bool * r;
+    const __m128i mask = @vpre@_set1_epi8(0x1);
     LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
     }
-    r = &op[i];
-    LOOP_BLOCKED(@type@, 16) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-        @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
-        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
-        /* may be unaligned */
-        memcpy(r, &ir, sizeof(ir));
-        r += sizeof(ir);
+    LOOP_BLOCKED(@type@, 64) {
+        @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+        __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
+        __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
+        __m128i rr = @vpre@_packs_epi16(ir1, ir2);
+        rr = @vpre@_and_si128(rr, mask);
+#if @double@
+        rr = @vpre@_packs_epi16(rr, rr);
+        @vpre@_storel_epi64((__m128i*)&op[i], rr);
+#else
+        @vpre@_storeu_si128((__m128i*)&op[i], rr);
+#endif
     }
     LOOP_BLOCKED_END {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
@@ -552,19 +538,30 @@ sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 static void
 sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
-    npy_bool * r;
-    @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
+    @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
+    const __m128i mask = @vpre@_set1_epi8(0x1);
     LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
     }
-    r = &op[i];
-    LOOP_BLOCKED(@type@, 16) {
-        @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
-        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
-        /* may be unaligned */
-        memcpy(r, &ir, sizeof(ir));
-        r += sizeof(ir);
+    LOOP_BLOCKED(@type@, 64) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
+        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
+        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
+        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
+        __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
+        __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
+        __m128i rr = @vpre@_packs_epi16(ir1, ir2);
+        rr = @vpre@_and_si128(rr, mask);
+#if @double@
+        rr = @vpre@_packs_epi16(rr, rr);
+        @vpre@_storel_epi64((__m128i*)&op[i], rr);
+#else
+        @vpre@_storeu_si128((__m128i*)&op[i], rr);
+#endif
     }
     LOOP_BLOCKED_END {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
@@ -575,19 +572,30 @@ sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
 static void
 sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
-    npy_bool * r;
-    @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
+    @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
+    const __m128i mask = @vpre@_set1_epi8(0x1);
     LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
     }
-    r = &op[i];
-    LOOP_BLOCKED(@type@, 16) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-        @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-        @mtype@ ir = @fanout@[@vpre@_movemask_@vsuf@(c)];
-        /* may be unaligned */
-        memcpy(r, &ir, sizeof(ir));
-        r += sizeof(ir);
+    LOOP_BLOCKED(@type@, 64) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
+        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
+        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
+        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
+        __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
+        __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
+        __m128i rr = @vpre@_packs_epi16(ir1, ir2);
+        rr = @vpre@_and_si128(rr, mask);
+#if @double@
+        rr = @vpre@_packs_epi16(rr, rr);
+        @vpre@_storel_epi64((__m128i*)&op[i], rr);
+#else
+        @vpre@_storeu_si128((__m128i*)&op[i], rr);
+#endif
     }
     LOOP_BLOCKED_END {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
```
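The `#if @double@` branch in the new loops exists because four vectors of two doubles yield only 8 results: after narrowing 32 → 16 → 8 bits every result occupies two adjacent bytes, so one extra pack folds the duplicates away and only 8 bytes are stored with `_mm_storel_epi64`. A hedged standalone sketch of that case (illustrative names again; in this sketch the `0x1` mask is applied after the final fold so the stored bytes come out as exactly 0 or 1):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* Compare 8 doubles of ip1 against 8 doubles of ip2 and write 8 bool
 * bytes (0 or 1) to op. */
static void
cmp8_lt_double(uint8_t *op, const double *ip1, const double *ip2)
{
    const __m128i mask = _mm_set1_epi8(0x1);
    __m128d a1 = _mm_loadu_pd(ip1 + 0), a2 = _mm_loadu_pd(ip2 + 0);
    __m128d b1 = _mm_loadu_pd(ip1 + 2), b2 = _mm_loadu_pd(ip2 + 2);
    __m128d c1 = _mm_loadu_pd(ip1 + 4), c2 = _mm_loadu_pd(ip2 + 4);
    __m128d d1 = _mm_loadu_pd(ip1 + 6), d2 = _mm_loadu_pd(ip2 + 6);
    /* each 64-bit compare lane is all-ones or all-zeros, i.e. two
     * identical 32-bit lanes when reinterpreted as integers */
    __m128i r1 = _mm_castpd_si128(_mm_cmplt_pd(a1, a2));
    __m128i r2 = _mm_castpd_si128(_mm_cmplt_pd(b1, b2));
    __m128i r3 = _mm_castpd_si128(_mm_cmplt_pd(c1, c2));
    __m128i r4 = _mm_castpd_si128(_mm_cmplt_pd(d1, d2));
    __m128i ir1 = _mm_packs_epi32(r1, r2);
    __m128i ir2 = _mm_packs_epi32(r3, r4);
    __m128i rr  = _mm_packs_epi16(ir1, ir2);   /* each result as a byte pair */
    rr = _mm_packs_epi16(rr, rr);              /* fold pairs: 8 bytes of 0x00/0xff */
    rr = _mm_and_si128(rr, mask);              /* 0xff -> 1 */
    _mm_storel_epi64((__m128i *)op, rr);       /* store only the low 8 bytes */
}
```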