diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2013-05-29 11:27:51 -0700 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2013-05-29 11:27:51 -0700 |
commit | 9303985bbe08935420d6b39f60d6f81043e7a57f (patch) | |
tree | 5a5831a394a300d1f0ca35292979621c089952b8 | |
parent | d3a7c893439c31ef287daf817dbd0cbe9c5ac16b (diff) | |
parent | 0258a3e65ba9fea8892aaa9761bf7e37c999c8fd (diff) | |
download | numpy-9303985bbe08935420d6b39f60d6f81043e7a57f.tar.gz |
Merge pull request #3376 from juliantaylor/unaligned-access
Unaligned access
-rw-r--r-- | numpy/core/src/multiarray/array_assign.c | 3 | ||||
-rw-r--r-- | numpy/core/src/multiarray/array_assign_scalar.c | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/ctors.c | 56 | ||||
-rw-r--r-- | numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 33 | ||||
-rw-r--r-- | numpy/core/src/private/lowlevel_strided_loops.h | 76 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 5 |
6 files changed, 117 insertions, 62 deletions
diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c index 6467b6cfd..fa764d758 100644 --- a/numpy/core/src/multiarray/array_assign.c +++ b/numpy/core/src/multiarray/array_assign.c @@ -22,6 +22,7 @@ #include "array_assign.h" #include "common.h" +#include "lowlevel_strided_loops.h" /* See array_assign.h for parameter documentation */ NPY_NO_EXPORT int @@ -92,7 +93,7 @@ raw_array_is_aligned(int ndim, char *data, npy_intp *strides, int alignment) align_check |= strides[idim]; } - return ((align_check & (alignment - 1)) == 0); + return npy_is_aligned((void *)align_check, alignment); } else { return 1; diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c index 2c1154264..df7facad6 100644 --- a/numpy/core/src/multiarray/array_assign_scalar.c +++ b/numpy/core/src/multiarray/array_assign_scalar.c @@ -48,7 +48,7 @@ raw_array_assign_scalar(int ndim, npy_intp *shape, /* Check alignment */ aligned = raw_array_is_aligned(ndim, dst_data, dst_strides, dst_dtype->alignment); - if (((npy_intp)src_data & (src_dtype->alignment - 1)) != 0) { + if (!npy_is_aligned(src_data, src_dtype->alignment)) { aligned = 0; } @@ -119,7 +119,7 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp *shape, /* Check alignment */ aligned = raw_array_is_aligned(ndim, dst_data, dst_strides, dst_dtype->alignment); - if (((npy_intp)src_data & (src_dtype->alignment - 1)) != 0) { + if (!npy_is_aligned(src_data, src_dtype->alignment)) { aligned = 0; } @@ -220,7 +220,7 @@ PyArray_AssignRawScalar(PyArrayObject *dst, * we also skip this if 'dst' has an object dtype. */ if ((!PyArray_EquivTypes(PyArray_DESCR(dst), src_dtype) || - ((npy_intp)src_data & (src_dtype->alignment - 1)) != 0) && + !npy_is_aligned(src_data, src_dtype->alignment)) && PyArray_SIZE(dst) > 1 && !PyDataType_REFCHK(PyArray_DESCR(dst))) { char *tmp_src_data; diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index b1a9d9859..5c692bd02 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -311,38 +311,42 @@ _strided_byte_swap(void *p, npy_intp stride, npy_intp n, int size) case 1: /* no byteswap necessary */ break; case 4: - for (a = (char*)p; n > 0; n--, a += stride) { - npy_uint32 * a_ = (npy_uint32 *)a; -#ifdef HAVE___BUILTIN_BSWAP32 - *a_ = __builtin_bswap32(*a_); -#else - /* a decent compiler can convert this to bswap too */ - *a_ = ((*a_ & 0xff000000u) >> 24) | ((*a_ & 0x00ff0000u) >> 8) | - ((*a_ & 0x0000ff00u) << 8) | ((*a_ & 0x000000ffu) << 24); -#endif + if (npy_is_aligned(p, sizeof(npy_uint32))) { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_uint32 * a_ = (npy_uint32 *)a; + *a_ = npy_bswap4(*a_); + } + } + else { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_bswap4_unaligned(a); + } } break; case 8: - for (a = (char*)p; n > 0; n--) { -#ifdef HAVE___BUILTIN_BSWAP64 - npy_uint64 * a_ = (npy_uint64 *)a; - *a_ = __builtin_bswap64(*a_); - a += stride; -#else - /* mask version would be faster but requires C99 */ - b = a + 7; - c = *a; *a++ = *b; *b-- = c; - c = *a; *a++ = *b; *b-- = c; - c = *a; *a++ = *b; *b-- = c; - c = *a; *a = *b; *b = c; - a += stride - 3; -#endif + if (npy_is_aligned(p, sizeof(npy_uint64))) { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_uint64 * a_ = (npy_uint64 *)a; + *a_ = npy_bswap8(*a_); + } + } + else { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_bswap8_unaligned(a); + } } break; case 2: - for (a = (char*)p; n > 0; n--, a += stride) { - npy_uint16 * a_ = (npy_uint16 *)a; - *a_ = (((*a_ >> 8) & 0xffu) | ((*a_ & 0xffu) << 8)); + if (npy_is_aligned(p, sizeof(npy_uint16))) { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_uint16 * a_ = (npy_uint16 *)a; + *a_ = npy_bswap2(*a_); + } + } + else { + for (a = (char*)p; n > 0; n--, a += stride) { + npy_bswap2_unaligned(a); + } } break; default: diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index ef29b855e..5c02c6e9f 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -36,27 +36,16 @@ #define _NPY_NOP4(x) (x) #define _NPY_NOP8(x) (x) -#define _NPY_SWAP2(x) (((((npy_uint16)x)&0xffu) << 8) | \ - (((npy_uint16)x) >> 8)) +#define _NPY_SWAP2(x) npy_bswap2(x) -#define _NPY_SWAP4(x) (((((npy_uint32)x)&0xffu) << 24) | \ - ((((npy_uint32)x)&0xff00u) << 8) | \ - ((((npy_uint32)x)&0xff0000u) >> 8) | \ - (((npy_uint32)x) >> 24)) +#define _NPY_SWAP4(x) npy_bswap4(x) #define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \ ((((npy_uint32)x)&0xff00u) >> 8) | \ ((((npy_uint32)x)&0xff0000u) << 8) | \ ((((npy_uint32)x)&0xff000000u) >> 8)) -#define _NPY_SWAP8(x) (((((npy_uint64)x)&0xffULL) << 56) | \ - ((((npy_uint64)x)&0xff00ULL) << 40) | \ - ((((npy_uint64)x)&0xff0000ULL) << 24) | \ - ((((npy_uint64)x)&0xff000000ULL) << 8) | \ - ((((npy_uint64)x)&0xff00000000ULL) >> 8) | \ - ((((npy_uint64)x)&0xff0000000000ULL) >> 24) | \ - ((((npy_uint64)x)&0xff000000000000ULL) >> 40) | \ - (((npy_uint64)x) >> 56)) +#define _NPY_SWAP8(x) npy_bswap8(x) #define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffULL) << 24) | \ ((((npy_uint64)x)&0xff00ULL) << 8) | \ @@ -67,21 +56,11 @@ ((((npy_uint64)x)&0xff000000000000ULL) >> 8) | \ ((((npy_uint64)x)&0xff00000000000000ULL) >> 24)) -#define _NPY_SWAP_INPLACE2(x) { \ - char a = (x)[0]; (x)[0] = (x)[1]; (x)[1] = a; \ - } +#define _NPY_SWAP_INPLACE2(x) npy_bswap2_unaligned(x) -#define _NPY_SWAP_INPLACE4(x) { \ - char a = (x)[0]; (x)[0] = (x)[3]; (x)[3] = a; \ - a = (x)[1]; (x)[1] = (x)[2]; (x)[2] = a; \ - } +#define _NPY_SWAP_INPLACE4(x) npy_bswap4_unaligned(x) -#define _NPY_SWAP_INPLACE8(x) { \ - char a = (x)[0]; (x)[0] = (x)[7]; (x)[7] = a; \ - a = (x)[1]; (x)[1] = (x)[6]; (x)[6] = a; \ - a = (x)[2]; (x)[2] = (x)[5]; (x)[5] = a; \ - a = (x)[3]; (x)[3] = (x)[4]; (x)[4] = a; \ - } +#define _NPY_SWAP_INPLACE8(x) npy_bswap8_unaligned(x) #define _NPY_SWAP_INPLACE16(x) { \ char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \ diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h index 742882a92..fffd02e03 100644 --- a/numpy/core/src/private/lowlevel_strided_loops.h +++ b/numpy/core/src/private/lowlevel_strided_loops.h @@ -1,5 +1,6 @@ #ifndef __LOWLEVEL_STRIDED_LOOPS_H #define __LOWLEVEL_STRIDED_LOOPS_H +#include <npy_config.h> /* * NOTE: This API should remain private for the time being, to allow @@ -396,6 +397,14 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp *shape, char **out_dataB, npy_intp *out_stridesB, char **out_dataC, npy_intp *out_stridesC); +/* + * return true if pointer is aligned to 'alignment' + */ +static NPY_INLINE int +npy_is_aligned(const void * p, const npy_uintp alignment) +{ + return ((npy_uintp)(p) & ((alignment) - 1)) == 0; +} /* * Return number of elements that must be peeled from @@ -441,6 +450,73 @@ npy_blocked_end(const npy_intp offset, const npy_intp esize, } +/* byte swapping functions */ +static NPY_INLINE npy_uint16 +npy_bswap2(npy_uint16 x) +{ + return ((x & 0xffu) << 8) | (x >> 8); +} + +/* + * treat as int16 and byteswap unaligned memory, + * some cpus don't support unaligned access + */ +static NPY_INLINE void +npy_bswap2_unaligned(char * x) +{ + char a = x[0]; + x[0] = x[1]; + x[1] = a; +} + +static NPY_INLINE npy_uint32 +npy_bswap4(npy_uint32 x) +{ +#ifdef HAVE___BUILTIN_BSWAP32 + return __builtin_bswap32(x); +#else + return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) | + ((x & 0xff0000u) >> 8) | (x >> 24); +#endif +} + +static NPY_INLINE void +npy_bswap4_unaligned(char * x) +{ + char a = x[0]; + x[0] = x[3]; + x[3] = a; + a = x[1]; + x[1] = x[2]; + x[2] = a; +} + +static NPY_INLINE npy_uint64 +npy_bswap8(npy_uint64 x) +{ +#ifdef HAVE___BUILTIN_BSWAP64 + return __builtin_bswap64(x); +#else + return ((x & 0xffULL) << 56) | + ((x & 0xff00ULL) << 40) | + ((x & 0xff0000ULL) << 24) | + ((x & 0xff000000ULL) << 8) | + ((x & 0xff00000000ULL) >> 8) | + ((x & 0xff0000000000ULL) >> 24) | + ((x & 0xff000000000000ULL) >> 40) | + ( x >> 56); +#endif +} + +static NPY_INLINE void +npy_bswap8_unaligned(char * x) +{ + char a = x[0]; x[0] = x[7]; x[7] = a; + a = x[1]; x[1] = x[6]; x[6] = a; + a = x[2]; x[2] = x[5]; x[5] = a; + a = x[3]; x[3] = x[4]; x[4] = a; +} + /* Start raw iteration */ #define NPY_RAW_ITER_START(idim, ndim, coord, shape) \ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index e307faa46..5eae448ee 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -34,11 +34,6 @@ */ -static NPY_INLINE int npy_is_aligned(const void * p, const npy_intp alignment) -{ - return ((npy_intp)(p) & ((alignment) - 1)) == 0; -} - #define IS_BINARY_REDUCE ((args[0] == args[2])\ && (steps[0] == steps[2])\ && (steps[0] == 0)) |