diff options
-rw-r--r-- | numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 33 | ||||
-rw-r--r-- | numpy/core/src/private/lowlevel_strided_loops.h | 76 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 5 |
3 files changed, 82 insertions, 32 deletions
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index ef29b855e..5c02c6e9f 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -36,27 +36,16 @@ #define _NPY_NOP4(x) (x) #define _NPY_NOP8(x) (x) -#define _NPY_SWAP2(x) (((((npy_uint16)x)&0xffu) << 8) | \ - (((npy_uint16)x) >> 8)) +#define _NPY_SWAP2(x) npy_bswap2(x) -#define _NPY_SWAP4(x) (((((npy_uint32)x)&0xffu) << 24) | \ - ((((npy_uint32)x)&0xff00u) << 8) | \ - ((((npy_uint32)x)&0xff0000u) >> 8) | \ - (((npy_uint32)x) >> 24)) +#define _NPY_SWAP4(x) npy_bswap4(x) #define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \ ((((npy_uint32)x)&0xff00u) >> 8) | \ ((((npy_uint32)x)&0xff0000u) << 8) | \ ((((npy_uint32)x)&0xff000000u) >> 8)) -#define _NPY_SWAP8(x) (((((npy_uint64)x)&0xffULL) << 56) | \ - ((((npy_uint64)x)&0xff00ULL) << 40) | \ - ((((npy_uint64)x)&0xff0000ULL) << 24) | \ - ((((npy_uint64)x)&0xff000000ULL) << 8) | \ - ((((npy_uint64)x)&0xff00000000ULL) >> 8) | \ - ((((npy_uint64)x)&0xff0000000000ULL) >> 24) | \ - ((((npy_uint64)x)&0xff000000000000ULL) >> 40) | \ - (((npy_uint64)x) >> 56)) +#define _NPY_SWAP8(x) npy_bswap8(x) #define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffULL) << 24) | \ ((((npy_uint64)x)&0xff00ULL) << 8) | \ @@ -67,21 +56,11 @@ ((((npy_uint64)x)&0xff000000000000ULL) >> 8) | \ ((((npy_uint64)x)&0xff00000000000000ULL) >> 24)) -#define _NPY_SWAP_INPLACE2(x) { \ - char a = (x)[0]; (x)[0] = (x)[1]; (x)[1] = a; \ - } +#define _NPY_SWAP_INPLACE2(x) npy_bswap2_unaligned(x) -#define _NPY_SWAP_INPLACE4(x) { \ - char a = (x)[0]; (x)[0] = (x)[3]; (x)[3] = a; \ - a = (x)[1]; (x)[1] = (x)[2]; (x)[2] = a; \ - } +#define _NPY_SWAP_INPLACE4(x) npy_bswap4_unaligned(x) -#define _NPY_SWAP_INPLACE8(x) { \ - char a = (x)[0]; (x)[0] = (x)[7]; (x)[7] = a; \ - a = (x)[1]; (x)[1] = (x)[6]; (x)[6] = a; \ - a = (x)[2]; (x)[2] = (x)[5]; (x)[5] = a; \ - a = (x)[3]; (x)[3] = (x)[4]; (x)[4] = a; \ - } +#define _NPY_SWAP_INPLACE8(x) npy_bswap8_unaligned(x) #define _NPY_SWAP_INPLACE16(x) { \ char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \ diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h index 742882a92..fffd02e03 100644 --- a/numpy/core/src/private/lowlevel_strided_loops.h +++ b/numpy/core/src/private/lowlevel_strided_loops.h @@ -1,5 +1,6 @@ #ifndef __LOWLEVEL_STRIDED_LOOPS_H #define __LOWLEVEL_STRIDED_LOOPS_H +#include <npy_config.h> /* * NOTE: This API should remain private for the time being, to allow @@ -396,6 +397,14 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp *shape, char **out_dataB, npy_intp *out_stridesB, char **out_dataC, npy_intp *out_stridesC); +/* + * return true if pointer is aligned to 'alignment' + */ +static NPY_INLINE int +npy_is_aligned(const void * p, const npy_uintp alignment) +{ + return ((npy_uintp)(p) & ((alignment) - 1)) == 0; +} /* * Return number of elements that must be peeled from @@ -441,6 +450,73 @@ npy_blocked_end(const npy_intp offset, const npy_intp esize, } +/* byte swapping functions */ +static NPY_INLINE npy_uint16 +npy_bswap2(npy_uint16 x) +{ + return ((x & 0xffu) << 8) | (x >> 8); +} + +/* + * treat as int16 and byteswap unaligned memory, + * some cpus don't support unaligned access + */ +static NPY_INLINE void +npy_bswap2_unaligned(char * x) +{ + char a = x[0]; + x[0] = x[1]; + x[1] = a; +} + +static NPY_INLINE npy_uint32 +npy_bswap4(npy_uint32 x) +{ +#ifdef HAVE___BUILTIN_BSWAP32 + return __builtin_bswap32(x); +#else + return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) | + ((x & 0xff0000u) >> 8) | (x >> 24); +#endif +} + +static NPY_INLINE void +npy_bswap4_unaligned(char * x) +{ + char a = x[0]; + x[0] = x[3]; + x[3] = a; + a = x[1]; + x[1] = x[2]; + x[2] = a; +} + +static NPY_INLINE npy_uint64 +npy_bswap8(npy_uint64 x) +{ +#ifdef HAVE___BUILTIN_BSWAP64 + return __builtin_bswap64(x); +#else + return ((x & 0xffULL) << 56) | + ((x & 0xff00ULL) << 40) | + ((x & 0xff0000ULL) << 24) | + ((x & 0xff000000ULL) << 8) | + ((x & 0xff00000000ULL) >> 8) | + ((x & 0xff0000000000ULL) >> 24) | + ((x & 0xff000000000000ULL) >> 40) | + ( x >> 56); +#endif +} + +static NPY_INLINE void +npy_bswap8_unaligned(char * x) +{ + char a = x[0]; x[0] = x[7]; x[7] = a; + a = x[1]; x[1] = x[6]; x[6] = a; + a = x[2]; x[2] = x[5]; x[5] = a; + a = x[3]; x[3] = x[4]; x[4] = a; +} + /* Start raw iteration */ #define NPY_RAW_ITER_START(idim, ndim, coord, shape) \ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index e307faa46..5eae448ee 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -34,11 +34,6 @@ */ -static NPY_INLINE int npy_is_aligned(const void * p, const npy_intp alignment) -{ - return ((npy_intp)(p) & ((alignment) - 1)) == 0; -} - #define IS_BINARY_REDUCE ((args[0] == args[2])\ && (steps[0] == steps[2])\ && (steps[0] == 0)) |