diff options
author | Qiyu8 <fangchunlin@huawei.com> | 2020-12-16 17:30:19 +0800 |
---|---|---|
committer | Qiyu8 <fangchunlin@huawei.com> | 2020-12-16 17:30:19 +0800 |
commit | 4a09b2fe9bc32b33f5411ced65cbec2389e2167f (patch) | |
tree | 79605ca57193c18114e2bd530a9d26d34d436cb6 | |
parent | e826c1f2616c72b023145fb4842bfbd1b60ddb3f (diff) | |
download | numpy-4a09b2fe9bc32b33f5411ced65cbec2389e2167f.tar.gz |
Only the last accumulation into the 32-bit counters needs this adjustment to avoid overflow.
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 19 |
1 files changed, 10 insertions, 9 deletions
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index abe7f4516..c71c4325a 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2138,14 +2138,14 @@ count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end) const npyv_u8 vone = npyv_setall_u8(1); const npyv_u8 vzero = npyv_zero_u8(); - npy_intp n = 0; + npy_intp lane_max = 0; npyv_u8 vsum8 = npyv_zero_u8(); - while (*d < end && n <= 0xFE) { + while (*d < end && lane_max <= 0xFE) { npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); vt = npyv_and_u8(vt, vone); vsum8 = npyv_add_u8(vsum8, vt); *d += npyv_nlanes_u8; - n += npyv_nlanes_u8; + lane_max += 1; } return vsum8; } @@ -2154,12 +2154,12 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end) { npyv_u16 vsum16 = npyv_zero_u16(); - npy_intp n = 0; - while (*d < end && n <= 0xFF00) { + npy_intp lane_max = 0; + while (*d < end && lane_max <= 0xFF00-0xFF) { npyv_u8 vsum8 = count_zero_bytes_u8(d, end); npyv_u16x2 part = npyv_expand_u16_u8(vsum8); vsum16 = npyv_add_u16(vsum16, npyv_add_u16(part.val[0], part.val[1])); - n += 0xFF; + lane_max += 0xFF*2; } return vsum16; } @@ -2168,12 +2168,13 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u32 count_zero_bytes_u32(const npy_uint8 **d, const npy_uint8 *end) { npyv_u32 vsum32 = npyv_zero_u32(); - npy_intp n = 0; - while (*d < end && n <= 0xFFFF0000) { + npy_intp lane_max = 0; + // The last accumulation needs to adjustment (2**32-1)/nlanes to avoid overflow. + while (*d < end && lane_max <= (0xFFFF0000-0xFFFF)/npyv_nlanes_u32) { npyv_u16 vsum16 = count_zero_bytes_u16(d, end); npyv_u32x2 part = npyv_expand_u32_u16(vsum16); vsum32 = npyv_add_u32(vsum32, npyv_add_u32(part.val[0], part.val[1])); - n += 0xFFFF; + lane_max += 0xFFFF*2; } return vsum32; } |