only the last accumulation into the 32-bit counters needs this adjustment to avoid overflow.

author: Qiyu8 <fangchunlin@huawei.com> 2020-12-16 17:30:19 +0800
committer: Qiyu8 <fangchunlin@huawei.com> 2020-12-16 17:30:19 +0800
commit: 4a09b2fe9bc32b33f5411ced65cbec2389e2167f (patch)
tree: 79605ca57193c18114e2bd530a9d26d34d436cb6
parent: e826c1f2616c72b023145fb4842bfbd1b60ddb3f (diff)
download: numpy-4a09b2fe9bc32b33f5411ced65cbec2389e2167f.tar.gz
1 files changed, 10 insertions, 9 deletions
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index abe7f4516..c71c4325a 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2138,14 +2138,14 @@ count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end)
     const npyv_u8 vone = npyv_setall_u8(1);
     const npyv_u8 vzero = npyv_zero_u8();
 
-    npy_intp n = 0;
+    npy_intp lane_max = 0;
     npyv_u8 vsum8 = npyv_zero_u8();
-    while (*d < end && n <= 0xFE) {
+    while (*d < end && lane_max <= 0xFE) {
         npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
         vt = npyv_and_u8(vt, vone);
         vsum8 = npyv_add_u8(vsum8, vt);
         *d += npyv_nlanes_u8;
-        n += npyv_nlanes_u8;
+        lane_max += 1;
     }
     return vsum8;
 }
@@ -2154,12 +2154,12 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16
 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end)
 {
     npyv_u16 vsum16 = npyv_zero_u16();
-    npy_intp n = 0;
-    while (*d < end && n <= 0xFF00) {
+    npy_intp lane_max = 0;
+    while (*d < end && lane_max <= 0xFF00-0xFF) {
         npyv_u8 vsum8 = count_zero_bytes_u8(d, end);
         npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
         vsum16 = npyv_add_u16(vsum16, npyv_add_u16(part.val[0], part.val[1]));
-        n += 0xFF;
+        lane_max += 0xFF*2;
     }
     return vsum16;
 }
@@ -2168,12 +2168,13 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u32
 count_zero_bytes_u32(const npy_uint8 **d, const npy_uint8 *end)
 {
     npyv_u32 vsum32 = npyv_zero_u32();
-    npy_intp n = 0;
-    while (*d < end && n <= 0xFFFF0000) {
+    npy_intp lane_max = 0;
+    // The last accumulation needs to adjustment (2**32-1)/nlanes to avoid overflow.
+    while (*d < end && lane_max <= (0xFFFF0000-0xFFFF)/npyv_nlanes_u32) {
         npyv_u16 vsum16 = count_zero_bytes_u16(d, end);
         npyv_u32x2 part = npyv_expand_u32_u16(vsum16);
         vsum32 = npyv_add_u32(vsum32, npyv_add_u32(part.val[0], part.val[1]));
-        n += 0xFFFF;
+        lane_max += 0xFFFF*2;
     }
     return vsum32;
 }
author	Qiyu8 <fangchunlin@huawei.com>	2020-12-16 17:30:19 +0800
committer	Qiyu8 <fangchunlin@huawei.com>	2020-12-16 17:30:19 +0800
commit	4a09b2fe9bc32b33f5411ced65cbec2389e2167f (patch)
tree	79605ca57193c18114e2bd530a9d26d34d436cb6
parent	e826c1f2616c72b023145fb4842bfbd1b60ddb3f (diff)
download	numpy-4a09b2fe9bc32b33f5411ced65cbec2389e2167f.tar.gz