diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-06 19:24:10 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-15 18:53:12 +0200 |
commit | 99cb95f7379a95f978fd04d183fde2d262b262a6 (patch) | |
tree | 0e08c6a2d07b3032f8ce5609093aad3214ca1a50 /numpy | |
parent | 0337cf2f8cf555912dd39a2767a0f7f9e6398257 (diff) | |
download | numpy-99cb95f7379a95f978fd04d183fde2d262b262a6.tar.gz |
ENH: optimize byteswapping via intrinsics
Byteswapping is implemented in hardware on x86 cpus, which is more than
twice as fast as the existing implementation.
The masked version used for 32-bit values plays better with compiler pattern
matching, so it can also be optimized by non-gcc/clang compilers.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/setup_common.py | 5 | ||||
-rw-r--r-- | numpy/core/src/multiarray/ctors.c | 27 |
2 files changed, 24 insertions, 8 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index a1a9ac9af..e778e507b 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -101,7 +101,10 @@ OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh", # call arguments are required as the compiler will do strict signature checking OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), ("__builtin_isinf", '5.'), - ("__builtin_isfinite", '5.')] + ("__builtin_isfinite", '5.'), + ("__builtin_bswap32", '5u'), + ("__builtin_bswap64", '5u'), + ] # Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot", diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index f366a34b1..b1a9d9859 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -311,25 +311,38 @@ _strided_byte_swap(void *p, npy_intp stride, npy_intp n, int size) case 1: /* no byteswap necessary */ break; case 4: - for (a = (char*)p; n > 0; n--, a += stride - 1) { - b = a + 3; - c = *a; *a++ = *b; *b-- = c; - c = *a; *a = *b; *b = c; + for (a = (char*)p; n > 0; n--, a += stride) { + npy_uint32 * a_ = (npy_uint32 *)a; +#ifdef HAVE___BUILTIN_BSWAP32 + *a_ = __builtin_bswap32(*a_); +#else + /* a decent compiler can convert this to bswap too */ + *a_ = ((*a_ & 0xff000000u) >> 24) | ((*a_ & 0x00ff0000u) >> 8) | + ((*a_ & 0x0000ff00u) << 8) | ((*a_ & 0x000000ffu) << 24); +#endif } break; case 8: - for (a = (char*)p; n > 0; n--, a += stride - 3) { + for (a = (char*)p; n > 0; n--) { +#ifdef HAVE___BUILTIN_BSWAP64 + npy_uint64 * a_ = (npy_uint64 *)a; + *a_ = __builtin_bswap64(*a_); + a += stride; +#else + /* mask version would be faster but requires C99 */ b = a + 7; c = *a; *a++ = *b; *b-- = c; c = *a; *a++ = *b; *b-- = c; c = *a; *a++ = *b; *b-- = c; c = *a; *a = *b; *b = c; + a += stride - 3; +#endif } break; case 2: for (a = (char*)p; n > 0; n--, a += stride) 
{ - b = a + 1; - c = *a; *a = *b; *b = c; + npy_uint16 * a_ = (npy_uint16 *)a; + *a_ = (((*a_ >> 8) & 0xffu) | ((*a_ & 0xffu) << 8)); } break; default: |