summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulian Taylor <juliantaylor108@gmail.com>2019-05-12 18:35:18 +0200
committerJulian Taylor <juliantaylor108@gmail.com>2019-05-12 22:40:51 +0200
commit886da7998d0cdc115cd010671a904390a7275810 (patch)
tree90fe9cfb094deebd92c820e764749adda45b8eeb
parente6227a0326b503172fa9f95e0544a099ec85e05d (diff)
downloadnumpy-886da7998d0cdc115cd010671a904390a7275810.tar.gz
ENH: restore unpack bit lookup table
Restores the good unpackbits performance that we have had since 1.13. Adds a second lookup table for the new little bitorder flag, and stores the tables in little-endian byte order, as that is the more common endianness.
-rw-r--r--benchmarks/benchmarks/bench_core.py6
-rw-r--r--numpy/core/src/multiarray/compiled_base.c96
2 files changed, 83 insertions(+), 19 deletions(-)
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 194ce3218..9e409dd91 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -162,12 +162,18 @@ class UnpackBits(Benchmark):
def time_unpackbits(self):
np.unpackbits(self.d)
+ def time_unpackbits_little(self):
+ np.unpackbits(self.d, bitorder="little")
+
def time_unpackbits_axis0(self):
np.unpackbits(self.d2, axis=0)
def time_unpackbits_axis1(self):
np.unpackbits(self.d2, axis=1)
+ def time_unpackbits_axis1_little(self):
+ np.unpackbits(self.d2, bitorder="little", axis=1)
+
class Indices(Benchmark):
def time_indices(self):
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 25dc6951c..b1c457cb4 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -1703,6 +1703,9 @@ fail:
static PyObject *
unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
{
+ static int unpack_init = 0;
+ static npy_uint64 unpack_lookup_l[256];
+ static npy_uint64 unpack_lookup_b[256];
PyArrayObject *inp;
PyArrayObject *new = NULL;
PyArrayObject *out = NULL;
@@ -1788,6 +1791,41 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
goto fail;
}
+ /*
+ * setup lookup table under GIL, 256 64 bit integers
+ * one integer represents 8 bits expanded to 0/1 bytes
+ */
+ if (unpack_init == 0) {
+ npy_uint64 j;
+ for (j=0; j < 256; j++) {
+ npy_uint64 v_b = 0;
+ npy_uint64 v_l;
+ v_b |= (npy_uint64)((j & 1) == 1) << 56;
+ v_b |= (npy_uint64)((j & 2) == 2) << 48;
+ v_b |= (npy_uint64)((j & 4) == 4) << 40;
+ v_b |= (npy_uint64)((j & 8) == 8) << 32;
+ v_b |= (npy_uint64)((j & 16) == 16) << 24;
+ v_b |= (npy_uint64)((j & 32) == 32) << 16;
+ v_b |= (npy_uint64)((j & 64) == 64) << 8;
+ v_b |= (npy_uint64)((j & 128) == 128);
+
+ /* for bitorder little the lookup table is just byte swapped */
+ v_l = npy_bswap8(v_b);
+
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+ /*
+ * the byte pattern must be fixed on all platforms so bigendian has
+ * to be swapped
+ */
+ v_l = npy_bswap8(v_l);
+ v_b = npy_bswap8(v_b);
+#endif
+ unpack_lookup_b[j] = v_b;
+ unpack_lookup_l[j] = v_l;
+ }
+ unpack_init = 1;
+ }
+
count = PyArray_DIM(new, axis) * 8;
if (outdims[axis] > count) {
in_n = count / 8;
@@ -1810,39 +1848,59 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
unsigned const char *inptr = PyArray_ITER_DATA(it);
char *outptr = PyArray_ITER_DATA(ot);
- if (order == 'b') {
+ if (out_stride == 1) {
+ const npy_uint64 * const unpack_lookup = (order == 'b') ?
+ unpack_lookup_b : unpack_lookup_l;
+ /* for unity stride we can just copy out of the lookup table */
for (index = 0; index < in_n; index++) {
- for (i = 0; i < 8; i++) {
- *outptr = ((*inptr & (128 >> i)) != 0);
- outptr += out_stride;
- }
+ memcpy(outptr, &unpack_lookup[*inptr], 8);
+ outptr += 8;
inptr += in_stride;
}
/* Clean up the tail portion */
- for (i = 0; i < in_tail; i++) {
- *outptr = ((*inptr & (128 >> i)) != 0);
- outptr += out_stride;
+ if (in_tail) {
+ memcpy(outptr, &unpack_lookup[*inptr], in_tail);
+ }
+ /* Add padding */
+ else if (out_pad) {
+ memset(outptr, 0, out_pad);
}
}
else {
- for (index = 0; index < in_n; index++) {
- for (i = 0; i < 8; i++) {
+ if (order == 'b') {
+ for (index = 0; index < in_n; index++) {
+ for (i = 0; i < 8; i++) {
+ *outptr = ((*inptr & (128 >> i)) != 0);
+ outptr += out_stride;
+ }
+ inptr += in_stride;
+ }
+ /* Clean up the tail portion */
+ for (i = 0; i < in_tail; i++) {
+ *outptr = ((*inptr & (128 >> i)) != 0);
+ outptr += out_stride;
+ }
+ }
+ else {
+ for (index = 0; index < in_n; index++) {
+ for (i = 0; i < 8; i++) {
+ *outptr = ((*inptr & (1 << i)) != 0);
+ outptr += out_stride;
+ }
+ inptr += in_stride;
+ }
+ /* Clean up the tail portion */
+ for (i = 0; i < in_tail; i++) {
*outptr = ((*inptr & (1 << i)) != 0);
outptr += out_stride;
}
- inptr += in_stride;
}
- /* Clean up the tail portion */
- for (i = 0; i < in_tail; i++) {
- *outptr = ((*inptr & (1 << i)) != 0);
+ /* Add padding */
+ for (index = 0; index < out_pad; index++) {
+ *outptr = 0;
outptr += out_stride;
}
}
- /* Add padding */
- for (index = 0; index < out_pad; index++) {
- *outptr = 0;
- outptr += out_stride;
- }
PyArray_ITER_NEXT(it);
PyArray_ITER_NEXT(ot);