diff options
author | Julian Taylor <juliantaylor108@gmail.com> | 2019-05-12 18:35:18 +0200 |
---|---|---|
committer | Julian Taylor <juliantaylor108@gmail.com> | 2019-05-12 22:40:51 +0200 |
commit | 886da7998d0cdc115cd010671a904390a7275810 (patch) | |
tree | 90fe9cfb094deebd92c820e764749adda45b8eeb | |
parent | e6227a0326b503172fa9f95e0544a099ec85e05d (diff) | |
download | numpy-886da7998d0cdc115cd010671a904390a7275810.tar.gz |
ENH: restore unpack bit lookup table
Restores the good performance of unpackbits we had since 1.13.
Added a second lookup table for the new little bitorder flag and changed
the lookup data to be stored in little endian, as that is the more common byte order.
-rw-r--r-- | benchmarks/benchmarks/bench_core.py | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/compiled_base.c | 96 |
2 files changed, 83 insertions, 19 deletions
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 194ce3218..9e409dd91 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -162,12 +162,18 @@ class UnpackBits(Benchmark):
     def time_unpackbits(self):
         np.unpackbits(self.d)
 
+    def time_unpackbits_little(self):
+        np.unpackbits(self.d, bitorder="little")
+
     def time_unpackbits_axis0(self):
         np.unpackbits(self.d2, axis=0)
 
     def time_unpackbits_axis1(self):
         np.unpackbits(self.d2, axis=1)
 
+    def time_unpackbits_axis1_little(self):
+        np.unpackbits(self.d2, bitorder="little", axis=1)
+
 
 class Indices(Benchmark):
     def time_indices(self):
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 25dc6951c..b1c457cb4 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -1703,6 +1703,9 @@ fail:
 static PyObject *
 unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
 {
+    static int unpack_init = 0;
+    static npy_uint64 unpack_lookup_l[256];
+    static npy_uint64 unpack_lookup_b[256];
     PyArrayObject *inp;
     PyArrayObject *new = NULL;
     PyArrayObject *out = NULL;
@@ -1788,6 +1791,41 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
         goto fail;
     }
 
+    /*
+     * setup lookup table under GIL, 256 64 bit integers
+     * one integer represents 8 bits expanded to 0/1 bytes
+     */
+    if (unpack_init == 0) {
+        npy_uint64 j;
+        for (j=0; j < 256; j++) {
+            npy_uint64 v_b = 0;
+            npy_uint64 v_l;
+            v_b |= (npy_uint64)((j & 1) == 1) << 56;
+            v_b |= (npy_uint64)((j & 2) == 2) << 48;
+            v_b |= (npy_uint64)((j & 4) == 4) << 40;
+            v_b |= (npy_uint64)((j & 8) == 8) << 32;
+            v_b |= (npy_uint64)((j & 16) == 16) << 24;
+            v_b |= (npy_uint64)((j & 32) == 32) << 16;
+            v_b |= (npy_uint64)((j & 64) == 64) << 8;
+            v_b |= (npy_uint64)((j & 128) == 128);
+            /* for bitorder little the lookup table is just byte swapped */
+            v_l = npy_bswap8(v_b);
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+            /*
+             * the byte pattern must be fixed on all platforms so bigendian has
+             * to be swapped
+             */
+            v_l = npy_bswap8(v_l);
+            v_b = npy_bswap8(v_b);
+#endif
+            unpack_lookup_b[j] = v_b;
+            unpack_lookup_l[j] = v_l;
+        }
+        unpack_init = 1;
+    }
+
     count = PyArray_DIM(new, axis) * 8;
     if (outdims[axis] > count) {
         in_n = count / 8;
@@ -1810,39 +1848,59 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
         unsigned const char *inptr = PyArray_ITER_DATA(it);
         char *outptr = PyArray_ITER_DATA(ot);
 
-        if (order == 'b') {
+        if (out_stride == 1) {
+            const npy_uint64 * const unpack_lookup = (order == 'b') ?
+                unpack_lookup_b : unpack_lookup_l;
+            /* for unity stride we can just copy out of the lookup table */
             for (index = 0; index < in_n; index++) {
-                for (i = 0; i < 8; i++) {
-                    *outptr = ((*inptr & (128 >> i)) != 0);
-                    outptr += out_stride;
-                }
+                memcpy(outptr, &unpack_lookup[*inptr], 8);
+                outptr += 8;
                 inptr += in_stride;
             }
             /* Clean up the tail portion */
-            for (i = 0; i < in_tail; i++) {
-                *outptr = ((*inptr & (128 >> i)) != 0);
-                outptr += out_stride;
+            if (in_tail) {
+                memcpy(outptr, &unpack_lookup[*inptr], in_tail);
+            }
+            /* Add padding */
+            else if (out_pad) {
+                memset(outptr, 0, out_pad);
             }
         }
         else {
-            for (index = 0; index < in_n; index++) {
-                for (i = 0; i < 8; i++) {
+            if (order == 'b') {
+                for (index = 0; index < in_n; index++) {
+                    for (i = 0; i < 8; i++) {
+                        *outptr = ((*inptr & (128 >> i)) != 0);
+                        outptr += out_stride;
+                    }
+                    inptr += in_stride;
+                }
+                /* Clean up the tail portion */
+                for (i = 0; i < in_tail; i++) {
+                    *outptr = ((*inptr & (128 >> i)) != 0);
+                    outptr += out_stride;
+                }
+            }
+            else {
+                for (index = 0; index < in_n; index++) {
+                    for (i = 0; i < 8; i++) {
+                        *outptr = ((*inptr & (1 << i)) != 0);
+                        outptr += out_stride;
+                    }
+                    inptr += in_stride;
+                }
+                /* Clean up the tail portion */
+                for (i = 0; i < in_tail; i++) {
                     *outptr = ((*inptr & (1 << i)) != 0);
                     outptr += out_stride;
                 }
-                inptr += in_stride;
             }
-            /* Clean up the tail portion */
-            for (i = 0; i < in_tail; i++) {
-                *outptr = ((*inptr & (1 << i)) != 0);
+            /* Add padding */
+            for (index = 0; index < out_pad; index++) {
+                *outptr = 0;
                 outptr += out_stride;
             }
         }
-        /* Add padding */
-        for (index = 0; index < out_pad; index++) {
-            *outptr = 0;
-            outptr += out_stride;
-        }
 
         PyArray_ITER_NEXT(it);
         PyArray_ITER_NEXT(ot);
|