diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-06 19:24:10 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-15 18:53:12 +0200 |
commit | 99cb95f7379a95f978fd04d183fde2d262b262a6 (patch) | |
tree | 0e08c6a2d07b3032f8ce5609093aad3214ca1a50 /numpy | |
parent | 0337cf2f8cf555912dd39a2767a0f7f9e6398257 (diff) | |
download | numpy-99cb95f7379a95f978fd04d183fde2d262b262a6.tar.gz |
ENH: optimize byteswapping via intrinsics
Byteswapping is implemented in hardware on x86 cpus, which is more than
twice as fast as the existing implementation.
The masked version used for 32-bit values plays better with compiler pattern
matching, so it can also be optimized by non-gcc/clang compilers.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/setup_common.py | 5 | ||||
-rw-r--r-- | numpy/core/src/multiarray/ctors.c | 27 |
2 files changed, 24 insertions, 8 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index a1a9ac9af..e778e507b 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -101,7 +101,10 @@ OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh", # call arguments are required as the compiler will do strict signature checking OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), ("__builtin_isinf", '5.'), - ("__builtin_isfinite", '5.')] + ("__builtin_isfinite", '5.'), + ("__builtin_bswap32", '5u'), + ("__builtin_bswap64", '5u'), + ] # Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot", diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index f366a34b1..b1a9d9859 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -311,25 +311,38 @@ _strided_byte_swap(void *p, npy_intp stride, npy_intp n, int size) case 1: /* no byteswap necessary */ break; case 4: - for (a = (char*)p; n > 0; n--, a += stride - 1) { - b = a + 3; - c = *a; *a++ = *b; *b-- = c; - c = *a; *a = *b; *b = c; + for (a = (char*)p; n > 0; n--, a += stride) { + npy_uint32 * a_ = (npy_uint32 *)a; +#ifdef HAVE___BUILTIN_BSWAP32 + *a_ = __builtin_bswap32(*a_); +#else + /* a decent compiler can convert this to bswap too */ + *a_ = ((*a_ & 0xff000000u) >> 24) | ((*a_ & 0x00ff0000u) >> 8) | + ((*a_ & 0x0000ff00u) << 8) | ((*a_ & 0x000000ffu) << 24); +#endif } break; case 8: - for (a = (char*)p; n > 0; n--, a += stride - 3) { + for (a = (char*)p; n > 0; n--) { +#ifdef HAVE___BUILTIN_BSWAP64 + npy_uint64 * a_ = (npy_uint64 *)a; + *a_ = __builtin_bswap64(*a_); + a += stride; +#else + /* mask version would be faster but requires C99 */ b = a + 7; c = *a; *a++ = *b; *b-- = c; c = *a; *a++ = *b; *b-- = c; c = *a; *a++ = *b; *b-- = c; c = *a; *a = *b; *b = c; + a += stride - 3; +#endif } break; case 2: for (a = (char*)p; n > 0; n--, a += stride) 
{ - b = a + 1; - c = *a; *a = *b; *b = c; + npy_uint16 * a_ = (npy_uint16 *)a; + *a_ = (((*a_ >> 8) & 0xffu) | ((*a_ & 0xffu) << 8)); } break; default: |