diff options
-rw-r--r-- | benchmarks/benchmarks/bench_core.py | 3 | ||||
-rw-r--r-- | numpy/core/src/multiarray/compiled_base.c | 3 |
2 files changed, 5 insertions, 1 deletion
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py index 0c2a18c15..1c028542d 100644 --- a/benchmarks/benchmarks/bench_core.py +++ b/benchmarks/benchmarks/bench_core.py @@ -165,6 +165,9 @@ class PackBits(Benchmark): def time_packbits(self, dtype): np.packbits(self.d) + def time_packbits_little(self, dtype): + np.packbits(self.d, bitorder="little") + def time_packbits_axis0(self, dtype): np.packbits(self.d2, axis=0) diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index f09a1de32..6ae4dda6b 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -1502,6 +1502,7 @@ pack_inner(const char *inptr, npy_intp vn_out = n_out - (remain ? 1 : 0); const int vstep = npyv_nlanes_u64; const int vstepx4 = vstep * 4; + const int isAligned = npy_is_aligned(outptr, sizeof(npy_uint64)); vn_out -= (vn_out & (vstep - 1)); for (; index <= vn_out - vstepx4; index += vstepx4, inptr += npyv_nlanes_u8 * 4) { npyv_u8 v0 = npyv_load_u8((const npy_uint8*)inptr); @@ -1520,7 +1521,7 @@ pack_inner(const char *inptr, bb[2] = npyv_tobits_b8(npyv_cmpneq_u8(v2, v_zero)); bb[3] = npyv_tobits_b8(npyv_cmpneq_u8(v3, v_zero)); if(out_stride == 1 && - (!NPY_STRONG_ALIGNMENT || npy_is_aligned(outptr, sizeof(npy_uint64)))) { + (!NPY_STRONG_ALIGNMENT || isAligned)) { npy_uint64 *ptr64 = (npy_uint64*)outptr; #if NPY_SIMD_WIDTH == 16 npy_uint64 bcomp = bb[0] | (bb[1] << 16) | (bb[2] << 32) | (bb[3] << 48); |