/** * This header implements `npyv_divisor_*` intrinsics used for computing the parameters * of fast integer division, while division intrinsics `npyv_divc_*` are defined in * {extension}/arithmetic.h. */ #ifndef NPY_SIMD #error "Not a standalone header, use simd/simd.h instead" #endif #ifndef _NPY_SIMD_INTDIV_H #define _NPY_SIMD_INTDIV_H /********************************************************************************** ** Integer division ********************************************************************************** * Almost all architecture (except Power10) doesn't support integer vector division, * also the cost of scalar division in architectures like x86 is too high it can take * 30 to 40 cycles on modern chips and up to 100 on old ones. * * Therefore we are using division by multiplying with precomputed reciprocal technique, * the method that been used in this implementation is based on T. Granlund and P. L. Montgomery * “Division by invariant integers using multiplication(see [Figure 4.1, 5.1] * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556) * * It shows a good impact for all architectures especially on X86, * however computing divisor parameters is kind of expensive so this implementation * should only works when divisor is a scalar and used multiple of times. * * The division process is separated into two intrinsics for each data type * * 1- npyv_{dtype}x3 npyv_divisor_{dtype} ({dtype} divisor); * For computing the divisor parameters (multiplier + shifters + sign of divisor(signed only)) * * 2- npyv_{dtype} npyv_divisor_{dtype} (npyv_{dtype} dividend, npyv_{dtype}x3 divisor_parms); * For performing the final division. * ** For example: * int vstep = npyv_nlanes_s32; // number of lanes * int x = 0x6e70; * npyv_s32x3 divisor = npyv_divisor_s32(x); // init divisor params * for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) { * npyv_s32 a = npyv_load_s32(*src); // load s32 vector from memory * a = npyv_divc_s32(a, divisor); // divide all elements by x * npyv_store_s32(dst, a); // store s32 vector into memory * } * ** NOTES: * - For 64-bit division on Aarch64 and IBM/Power, we fall-back to the scalar division * since emulating multiply-high is expensive and both architectures have very fast dividers. * *************************************************************** ** Figure 4.1: Unsigned division by run–time invariant divisor *************************************************************** * Initialization (given uword d with 1 ≤ d < 2^N): * int l = ceil(log2(d)); * uword m = 2^N * (2^l− d) / d + 1; * int sh1 = min(l, 1); * int sh2 = max(l − 1, 0); * * For q = FLOOR(a/d), all uword: * uword t1 = MULUH(m, a); * q = SRL(t1 + SRL(a − t1, sh1), sh2); * ************************************************************************************ ** Figure 5.1: Signed division by run–time invariant divisor, rounded towards zero ************************************************************************************ * Initialization (given constant sword d with d !=0): * int l = max(ceil(log2(abs(d))), 1); * udword m0 = 1 + (2^(N+l-1)) / abs(d); * sword m = m0 − 2^N; * sword dsign = XSIGN(d); * int sh = l − 1; * * For q = TRUNC(a/d), all sword: * sword q0 = a + MULSH(m, a); * q0 = SRA(q0, sh) − XSIGN(a); * q = EOR(q0, dsign) − dsign; */ /** * bit-scan reverse for non-zeros. returns the index of the highest set bit. * equivalent to floor(log2(a)) */ #ifdef _MSC_VER #include // _BitScanReverse #endif NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a) { assert(a > 0); // due to use __builtin_clz unsigned r; #if defined(NPY_HAVE_SSE2) && defined(_MSC_VER) unsigned long rl; (void)_BitScanReverse(&rl, (unsigned long)a); r = (unsigned)rl; #elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) \ && (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) __asm__("bsr %1, %0" : "=r" (r) : "r"(a)); #elif defined(__GNUC__) || defined(__clang__) r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw #else r = 0; while (a >>= 1) { r++; } #endif return r; } NPY_FINLINE unsigned npyv__bitscan_revnz_u64(npy_uint64 a) { assert(a > 0); // due to use __builtin_clzll #if defined(_M_AMD64) && defined(_MSC_VER) unsigned long rl; (void)_BitScanReverse64(&rl, a); return (unsigned)rl; #elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) npy_uint64 r; __asm__("bsrq %1, %0" : "=r"(r) : "r"(a)); return (unsigned)r; #elif defined(__GNUC__) || defined(__clang__) return 63 - __builtin_clzll(a); #else npy_uint64 a_hi = a >> 32; if (a_hi == 0) { return npyv__bitscan_revnz_u32((npy_uint32)a); } return 32 + npyv__bitscan_revnz_u32((npy_uint32)a_hi); #endif } /** * Divides 128-bit unsigned integer by a 64-bit when the lower * 64-bit of the dividend is zero. * * This function is needed to calculate the multiplier of 64-bit integer division * see npyv_divisor_u64/npyv_divisor_s64. */ NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) { assert(divisor > 1); npy_uint64 quotient; #if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 && !defined(__clang__) npy_uint64 remainder; quotient = _udiv128(high, 0, divisor, &remainder); (void)remainder; #elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) __asm__("divq %[d]" : "=a"(quotient) : [d] "r"(divisor), "a"(0), "d"(high)); #elif defined(__SIZEOF_INT128__) quotient = (npy_uint64)((((__uint128_t)high) << 64) / divisor); #else /** * Minified version based on Donald Knuth’s Algorithm D (Division of nonnegative integers), * and Generic implementation in Hacker’s Delight. * * See https://skanthak.homepage.t-online.de/division.html * with respect to the license of the Hacker's Delight book * (https://web.archive.org/web/20190408122508/http://www.hackersdelight.org/permissions.htm) */ // shift amount for normalize unsigned ldz = 63 - npyv__bitscan_revnz_u64(divisor); // normalize divisor divisor <<= ldz; high <<= ldz; // break divisor up into two 32-bit digits npy_uint32 divisor_hi = divisor >> 32; npy_uint32 divisor_lo = divisor & 0xFFFFFFFF; // compute high quotient digit npy_uint64 quotient_hi = high / divisor_hi; npy_uint64 remainder = high - divisor_hi * quotient_hi; npy_uint64 base32 = 1ULL << 32; while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) { --quotient_hi; remainder += divisor_hi; if (remainder >= base32) { break; } } // compute dividend digit pairs npy_uint64 dividend_pairs = base32*high - divisor*quotient_hi; // compute second quotient digit for lower zeros npy_uint32 quotient_lo = (npy_uint32)(dividend_pairs / divisor_hi); quotient = base32*quotient_hi + quotient_lo; #endif return quotient; } // Initializing divisor parameters for unsigned 8-bit division NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) { unsigned l, l2, sh1, sh2, m; switch (d) { case 0: // LCOV_EXCL_LINE // for potential divide by zero, On x86 GCC inserts `ud2` instruction // instead of letting the HW/CPU trap it which leads to illegal instruction exception. // 'volatile' should suppress this behavior and allow us to raise HW/CPU // arithmetic exception. m = sh1 = sh2 = 1 / ((npy_uint8 volatile *)&d)[0]; break; case 1: m = 1; sh1 = sh2 = 0; break; case 2: m = 1; sh1 = 1; sh2 = 0; break; default: l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) l2 = (npy_uint8)(1 << l); // 2^l, overflow to 0 if l = 8 m = ((npy_uint16)((l2 - d) << 8)) / d + 1; // multiplier sh1 = 1; sh2 = l - 1; // shift counts } npyv_u8x3 divisor; #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[0] = npyv_setall_u16(m); divisor.val[1] = npyv_set_u8(sh1); divisor.val[2] = npyv_set_u8(sh2); #elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_setall_u8(sh1); divisor.val[2] = npyv_setall_u8(sh2); #elif defined(NPY_HAVE_NEON) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1)); divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2)); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; } // Initializing divisor parameters for signed 8-bit division NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d); NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) { #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 npyv_s16x3 p = npyv_divisor_s16(d); npyv_s8x3 r; r.val[0] = npyv_reinterpret_s8_s16(p.val[0]); r.val[1] = npyv_reinterpret_s8_s16(p.val[1]); r.val[2] = npyv_reinterpret_s8_s16(p.val[2]); return r; #else int d1 = abs(d); int sh, m; if (d1 > 1) { sh = (int)npyv__bitscan_revnz_u32(d1-1); // ceil(log2(abs(d))) - 1 m = (1 << (8 + sh)) / d1 + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int8 volatile *)&d)[0]; // LCOV_EXCL_LINE } npyv_s8x3 divisor; divisor.val[0] = npyv_setall_s8(m); divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0); #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s8(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s8(-sh); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; #endif } // Initializing divisor parameters for unsigned 16-bit division NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) { unsigned l, l2, sh1, sh2, m; switch (d) { case 0: // LCOV_EXCL_LINE // raise arithmetic exception for d == 0 m = sh1 = sh2 = 1 / ((npy_uint16 volatile *)&d)[0]; break; case 1: m = 1; sh1 = sh2 = 0; break; case 2: m = 1; sh1 = 1; sh2 = 0; break; default: l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) l2 = (npy_uint16)(1 << l); // 2^l, overflow to 0 if l = 16 m = ((l2 - d) << 16) / d + 1; // multiplier sh1 = 1; sh2 = l - 1; // shift counts } npyv_u16x3 divisor; divisor.val[0] = npyv_setall_u16(m); #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u16(sh1); divisor.val[2] = npyv_set_u16(sh2); #elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_u16(sh1); divisor.val[2] = npyv_setall_u16(sh2); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1)); divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2)); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; } // Initializing divisor parameters for signed 16-bit division NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) { int d1 = abs(d); int sh, m; if (d1 > 1) { sh = (int)npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 m = (1 << (16 + sh)) / d1 + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int16 volatile *)&d)[0]; // LCOV_EXCL_LINE } npyv_s16x3 divisor; divisor.val[0] = npyv_setall_s16(m); divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s16(sh); #elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s16(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s16(-sh); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; } // Initializing divisor parameters for unsigned 32-bit division NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) { npy_uint32 l, l2, sh1, sh2, m; switch (d) { case 0: // LCOV_EXCL_LINE // raise arithmetic exception for d == 0 m = sh1 = sh2 = 1 / ((npy_uint32 volatile *)&d)[0]; // LCOV_EXCL_LINE break; case 1: m = 1; sh1 = sh2 = 0; break; case 2: m = 1; sh1 = 1; sh2 = 0; break; default: l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) l2 = (npy_uint32)(1ULL << l); // 2^l, overflow to 0 if l = 32 m = ((npy_uint64)(l2 - d) << 32) / d + 1; // multiplier sh1 = 1; sh2 = l - 1; // shift counts } npyv_u32x3 divisor; divisor.val[0] = npyv_setall_u32(m); #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u32(sh1); divisor.val[2] = npyv_set_u32(sh2); #elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_u32(sh1); divisor.val[2] = npyv_setall_u32(sh2); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1)); divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2)); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; } // Initializing divisor parameters for signed 32-bit division NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) { npy_int32 d1 = abs(d); npy_int32 sh, m; // Handel abs overflow if ((npy_uint32)d == 0x80000000U) { m = 0x80000001; sh = 30; } else if (d1 > 1) { sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 m = (1ULL << (32 + sh)) / d1 + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE } npyv_s32x3 divisor; divisor.val[0] = npyv_setall_s32(m); divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s32(sh); #elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s32(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s32(-sh); #else #error "please initialize the shifting operand for the new architecture" #endif return divisor; } // Initializing divisor parameters for unsigned 64-bit division NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) { npyv_u64x3 divisor; #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON) divisor.val[0] = npyv_setall_u64(d); #else npy_uint64 l, l2, sh1, sh2, m; switch (d) { case 0: // LCOV_EXCL_LINE // raise arithmetic exception for d == 0 m = sh1 = sh2 = 1 / ((npy_uint64 volatile *)&d)[0]; // LCOV_EXCL_LINE break; case 1: m = 1; sh1 = sh2 = 0; break; case 2: m = 1; sh1 = 1; sh2 = 0; break; default: l = npyv__bitscan_revnz_u64(d - 1) + 1; // ceil(log2(d)) l2 = l < 64 ? 1ULL << l : 0; // 2^l m = npyv__divh128_u64(l2 - d, d) + 1; // multiplier sh1 = 1; sh2 = l - 1; // shift counts } divisor.val[0] = npyv_setall_u64(m); #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u64(sh1); divisor.val[2] = npyv_set_u64(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif #endif return divisor; } // Initializing divisor parameters for signed 64-bit division NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) { npyv_s64x3 divisor; #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON) divisor.val[0] = npyv_setall_s64(d); divisor.val[1] = npyv_cvt_s64_b64( npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0]) ); #else npy_int64 d1 = llabs(d); npy_int64 sh, m; // Handel abs overflow if ((npy_uint64)d == 0x8000000000000000ULL) { m = 0x8000000000000001LL; sh = 62; } else if (d1 > 1) { sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1 m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE } divisor.val[0] = npyv_setall_s64(m); divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s64(sh); #else #error "please initialize the shifting operand for the new architecture" #endif #endif return divisor; } #endif // _NPY_SIMD_INTDIV_H