summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorJulian Taylor <jtaylor.debian@googlemail.com>2013-05-06 19:24:10 +0200
committerJulian Taylor <jtaylor.debian@googlemail.com>2013-05-15 18:53:12 +0200
commit99cb95f7379a95f978fd04d183fde2d262b262a6 (patch)
tree0e08c6a2d07b3032f8ce5609093aad3214ca1a50 /numpy
parent0337cf2f8cf555912dd39a2767a0f7f9e6398257 (diff)
downloadnumpy-99cb95f7379a95f978fd04d183fde2d262b262a6.tar.gz
ENH: optimize byteswapping via intrinsics
Byteswapping is implemented in hardware on x86 CPUs, which is more than twice as fast as the existing implementation. The masked version used for 32 bit works better with compiler pattern matching, so it can also be optimized by non-gcc/clang compilers.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/setup_common.py5
-rw-r--r--numpy/core/src/multiarray/ctors.c27
2 files changed, 24 insertions, 8 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index a1a9ac9af..e778e507b 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -101,7 +101,10 @@ OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
# call arguments are required as the compiler will do strict signature checking
OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_isinf", '5.'),
- ("__builtin_isfinite", '5.')]
+ ("__builtin_isfinite", '5.'),
+ ("__builtin_bswap32", '5u'),
+ ("__builtin_bswap64", '5u'),
+ ]
# Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index f366a34b1..b1a9d9859 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -311,25 +311,38 @@ _strided_byte_swap(void *p, npy_intp stride, npy_intp n, int size)
case 1: /* no byteswap necessary */
break;
case 4:
- for (a = (char*)p; n > 0; n--, a += stride - 1) {
- b = a + 3;
- c = *a; *a++ = *b; *b-- = c;
- c = *a; *a = *b; *b = c;
+ for (a = (char*)p; n > 0; n--, a += stride) {
+ npy_uint32 * a_ = (npy_uint32 *)a;
+#ifdef HAVE___BUILTIN_BSWAP32
+ *a_ = __builtin_bswap32(*a_);
+#else
+ /* a decent compiler can convert this to bswap too */
+ *a_ = ((*a_ & 0xff000000u) >> 24) | ((*a_ & 0x00ff0000u) >> 8) |
+ ((*a_ & 0x0000ff00u) << 8) | ((*a_ & 0x000000ffu) << 24);
+#endif
}
break;
case 8:
- for (a = (char*)p; n > 0; n--, a += stride - 3) {
+ for (a = (char*)p; n > 0; n--) {
+#ifdef HAVE___BUILTIN_BSWAP64
+ npy_uint64 * a_ = (npy_uint64 *)a;
+ *a_ = __builtin_bswap64(*a_);
+ a += stride;
+#else
+ /* mask version would be faster but requires C99 */
b = a + 7;
c = *a; *a++ = *b; *b-- = c;
c = *a; *a++ = *b; *b-- = c;
c = *a; *a++ = *b; *b-- = c;
c = *a; *a = *b; *b = c;
+ a += stride - 3;
+#endif
}
break;
case 2:
for (a = (char*)p; n > 0; n--, a += stride) {
- b = a + 1;
- c = *a; *a = *b; *b = c;
+ npy_uint16 * a_ = (npy_uint16 *)a;
+ *a_ = (((*a_ >> 8) & 0xffu) | ((*a_ & 0xffu) << 8));
}
break;
default: