author    | Julian Taylor <jtaylor.debian@googlemail.com> | 2016-11-29 00:19:21 +0100
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2017-01-12 17:17:07 +0100
commit    | f0f7ad80f2ef2d7525965dfe27c0e2ab68647197 (patch)
tree      | 885b69a2c89ca924395da8f908f5ba2c48383864
parent    | 7e6091c9a3fc4536ccbadb337e88650b2c901313 (diff)
download  | numpy-f0f7ad80f2ef2d7525965dfe27c0e2ab68647197.tar.gz
ENH: vectorize packbits with SSE2
SSE2 has an instruction that collects the most significant bit of each byte
into a bitmask, available as the intrinsic _mm_movemask_epi8. It is
significantly faster than the per-byte loop currently being used.
Unfortunately packbits is bitwise "big endian": the first input byte maps to
the most significant bit of the output, while _mm_movemask_epi8 is little
endian, so the input has to be byteswapped first. Even so, it is still about
8-10 times faster than the scalar code.
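
To make the trick concrete, here is a minimal, self-contained sketch (not the NumPy source): it packs 16 boolean bytes into two output bytes in the MSB-first order np.packbits uses. The names pack16 and bswap64 are invented for the example; the commit itself uses npy_bswap8 and builds the vector with _mm_set_epi64/_m_from_int64 for icc compatibility, whereas the sketch uses the plain _mm_set_epi64x intrinsic to stay short.

```c
/* Sketch only: pack 16 boolean bytes into 2 bits-per-bool output bytes,
 * most significant bit first, using _mm_movemask_epi8 (SSE2). */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* portable byteswap of a 64-bit value (NumPy uses npy_bswap8 for this) */
static uint64_t bswap64(uint64_t x)
{
    uint64_t r = 0;
    int i;
    for (i = 0; i < 8; i++) {
        r = (r << 8) | (x & 0xff);
        x >>= 8;
    }
    return r;
}

static void pack16(const uint8_t *in, uint8_t out[2])
{
    uint64_t a, b;
    __m128i v, zero = _mm_setzero_si128();
    unsigned int r;

    /* byteswap each 8-byte half so the first bool lands in the most
     * significant bit of its output byte (packbits' "big endian" order) */
    memcpy(&a, in, 8);
    memcpy(&b, in + 8, 8);
    a = bswap64(a);
    b = bswap64(b);
    v = _mm_set_epi64x((long long)b, (long long)a);

    /* normalize: 0 stays 0x00, any non-zero byte becomes 0xFF
     * (two cmpeq against zero, since SSE2 has no byte compare-not-equal) */
    v = _mm_cmpeq_epi8(v, zero);
    v = _mm_cmpeq_epi8(v, zero);

    /* movemask collects the most significant bit of all 16 bytes */
    r = (unsigned int)_mm_movemask_epi8(v);
    out[0] = (uint8_t)(r & 0xff);
    out[1] = (uint8_t)(r >> 8);
}

int main(void)
{
    const uint8_t in[16] = {1, 0, 0, 0, 0, 0, 0, 1,    /* -> 0x81 */
                            1, 1, 1, 1, 0, 0, 0, 0};   /* -> 0xf0 */
    uint8_t out[2];
    pack16(in, out);
    printf("0x%02x 0x%02x\n", out[0], out[1]);
    return 0;
}
```

The scalar equivalent needs a shift and an or per input byte, which is where the quoted 8-10x speedup comes from.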
-rw-r--r-- | numpy/core/setup_common.py                |  2
-rw-r--r-- | numpy/core/src/multiarray/compiled_base.c | 36
2 files changed, 36 insertions, 2 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 18066d991..596b3996c 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -130,6 +130,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        # broken on OSX 10.11, make sure its not optimized away
                        ("volatile int r = __builtin_cpu_supports", '"sse"',
                         "stdio.h", "__BUILTIN_CPU_SUPPORTS"),
+                       # MMX only needed for icc, but some clangs don't have it
+                       ("_m_from_int64", '0', "emmintrin.h"),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
                         "xmmintrin.h"),  # SSE
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index f2323d9e2..e4838ec1c 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -9,6 +9,7 @@
 #include "numpy/npy_math.h"
 #include "npy_config.h"
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
+#include "lowlevel_strided_loops.h" /* for npy_bswap8 */
 
 
 /*
@@ -1475,6 +1476,9 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
     Py_RETURN_NONE;
 }
 
+#if defined NPY_HAVE_SSE2_INTRINSICS
+#include <emmintrin.h>
+#endif
 
 /*
  * This function packs boolean values in the input array into the bits of a
@@ -1497,13 +1501,41 @@ pack_inner(const char *inptr,
      * No:  move on
      * Every 8th value, set the value of build and increment the outptr
      */
-    npy_intp index;
+    npy_intp index = 0;
     int remain = n_in % 8;              /* uneven bits */
 
+#if defined NPY_HAVE_SSE2_INTRINSICS && defined HAVE__M_FROM_INT64
+    if (in_stride == 1 && element_size == 1 && n_out > 2) {
+        __m128i zero = _mm_setzero_si128();
+        /* don't handle non-full 8-byte remainder */
+        npy_intp vn_out = n_out - (remain ? 1 : 0);
+        vn_out -= (vn_out & 1);
+        for (index = 0; index < vn_out; index += 2) {
+            unsigned int r;
+            /* swap as packbits is "big endian", note x86 can load unaligned */
+            npy_uint64 a = npy_bswap8(*(npy_uint64*)inptr);
+            npy_uint64 b = npy_bswap8(*(npy_uint64*)(inptr + 8));
+            __m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a));
+            /* false -> 0x00 and true -> 0xFF (there is no cmpneq) */
+            v = _mm_cmpeq_epi8(v, zero);
+            v = _mm_cmpeq_epi8(v, zero);
+            /* extract msb of 16 bytes and pack it into 16 bit */
+            r = _mm_movemask_epi8(v);
+            /* store result */
+            memcpy(outptr, &r, 1);
+            outptr += out_stride;
+            memcpy(outptr, (char*)&r + 1, 1);
+            outptr += out_stride;
+            inptr += 16;
+        }
+    }
+#endif
+
     if (remain == 0) {                  /* assumes n_in > 0 */
         remain = 8;
     }
-    for (index = 0; index < n_out; index++) {
+    /* don't reset index to handle remainder of above block */
+    for (; index < n_out; index++) {
         char build = 0;
         int i, maxi;
         npy_intp j;
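
Two design points in the diff are worth calling out. The double _mm_cmpeq_epi8 against zero normalizes arbitrary non-zero "true" bytes to 0xFF (SSE2 has no byte compare-not-equal), so _mm_movemask_epi8 then reads a clean sign bit from every lane. The vector loop only handles full 16-byte blocks and leaves index at its final value, so the existing scalar loop below it finishes the remaining output bytes, including any partial last byte. Finally, the _m_from_int64 probe added to OPTIONAL_INTRINSICS gates the whole block: the SSE2 path is compiled only when HAVE__M_FROM_INT64 is defined, i.e. when the compiler actually provides that conversion intrinsic.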