Diffstat (limited to 'include/jemalloc/internal/bit_util.h')
-rw-r--r-- | include/jemalloc/internal/bit_util.h | 457
1 file changed, 320 insertions, 137 deletions
diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index c045eb86..bac59140 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -3,144 +3,383 @@ #include "jemalloc/internal/assert.h" -#define BIT_UTIL_INLINE static inline - /* Sanity check. */ #if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \ || !defined(JEMALLOC_INTERNAL_FFS) # error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure #endif +/* + * Unlike the builtins and posix ffs functions, our ffs requires a non-zero + * input, and returns the position of the lowest bit set (as opposed to the + * posix versions, which return 1 larger than that position and use a return + * value of zero as a sentinel. This tends to simplify logic in callers, and + * allows for consistency with the builtins we build fls on top of. + */ +static inline unsigned +ffs_llu(unsigned long long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSLL(x) - 1; +} -BIT_UTIL_INLINE unsigned -ffs_llu(unsigned long long bitmap) { - return JEMALLOC_INTERNAL_FFSLL(bitmap); +static inline unsigned +ffs_lu(unsigned long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSL(x) - 1; } -BIT_UTIL_INLINE unsigned -ffs_lu(unsigned long bitmap) { - return JEMALLOC_INTERNAL_FFSL(bitmap); +static inline unsigned +ffs_u(unsigned x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFS(x) - 1; } -BIT_UTIL_INLINE unsigned -ffs_u(unsigned bitmap) { - return JEMALLOC_INTERNAL_FFS(bitmap); +#define DO_FLS_SLOW(x, suffix) do { \ + util_assume(x != 0); \ + x |= (x >> 1); \ + x |= (x >> 2); \ + x |= (x >> 4); \ + x |= (x >> 8); \ + x |= (x >> 16); \ + if (sizeof(x) > 4) { \ + /* \ + * If sizeof(x) is 4, then the expression "x >> 32" \ + * will generate compiler warnings even if the code \ + * never executes. This circumvents the warning, and \ + * gets compiled out in optimized builds. \ + */ \ + int constant_32 = sizeof(x) * 4; \ + x |= (x >> constant_32); \ + } \ + x++; \ + if (x == 0) { \ + return 8 * sizeof(x) - 1; \ + } \ + return ffs_##suffix(x) - 1; \ +} while(0) + +static inline unsigned +fls_llu_slow(unsigned long long x) { + DO_FLS_SLOW(x, llu); } -#ifdef JEMALLOC_INTERNAL_POPCOUNTL -BIT_UTIL_INLINE unsigned +static inline unsigned +fls_lu_slow(unsigned long x) { + DO_FLS_SLOW(x, lu); +} + +static inline unsigned +fls_u_slow(unsigned x) { + DO_FLS_SLOW(x, u); +} + +#undef DO_FLS_SLOW + +#ifdef JEMALLOC_HAVE_BUILTIN_CLZ +static inline unsigned +fls_llu(unsigned long long x) { + util_assume(x != 0); + /* + * Note that the xor here is more naturally written as subtraction; the + * last bit set is the number of bits in the type minus the number of + * leading zero bits. But GCC implements that as: + * bsr edi, edi + * mov eax, 31 + * xor edi, 31 + * sub eax, edi + * If we write it as xor instead, then we get + * bsr eax, edi + * as desired. + */ + return (8 * sizeof(x) - 1) ^ __builtin_clzll(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clzl(x); +} + +static inline unsigned +fls_u(unsigned x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clz(x); +} +#elif defined(_MSC_VER) + +#if LG_SIZEOF_PTR == 3 +#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x) +#else +/* + * This never actually runs; we're just dodging a compiler error for the + * never-taken branch where sizeof(void *) == 8. 
+ */ +#define DO_BSR64(bit, x) bit = 0; unreachable() +#endif + +#define DO_FLS(x) do { \ + if (x == 0) { \ + return 8 * sizeof(x); \ + } \ + unsigned long bit; \ + if (sizeof(x) == 4) { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 8) { \ + DO_BSR64(bit, x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 4) { \ + /* Dodge a compiler warning, as above. */ \ + int constant_32 = sizeof(x) * 4; \ + if (_BitScanReverse(&bit, \ + (unsigned)(x >> constant_32))) { \ + return 32 + (unsigned)bit; \ + } else { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + } \ + unreachable(); \ +} while (0) + +static inline unsigned +fls_llu(unsigned long long x) { + DO_FLS(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + DO_FLS(x); +} + +static inline unsigned +fls_u(unsigned x) { + DO_FLS(x); +} + +#undef DO_FLS +#undef DO_BSR64 +#else + +static inline unsigned +fls_llu(unsigned long long x) { + return fls_llu_slow(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + return fls_lu_slow(x); +} + +static inline unsigned +fls_u(unsigned x) { + return fls_u_slow(x); +} +#endif + +#if LG_SIZEOF_LONG_LONG > 3 +# error "Haven't implemented popcount for 16-byte ints." +#endif + +#define DO_POPCOUNT(x, type) do { \ + /* \ + * Algorithm from an old AMD optimization reference manual. \ + * We're putting a little bit more work than you might expect \ + * into the no-instrinsic case, since we only support the \ + * GCC intrinsics spelling of popcount (for now). Detecting \ + * whether or not the popcount builtin is actually useable in \ + * MSVC is nontrivial. \ + */ \ + \ + type bmul = (type)0x0101010101010101ULL; \ + \ + /* \ + * Replace each 2 bits with the sideways sum of the original \ + * values. 0x5 = 0b0101. \ + * \ + * You might expect this to be: \ + * x = (x & 0x55...) + ((x >> 1) & 0x55...). \ + * That costs an extra mask relative to this, though. \ + */ \ + x = x - ((x >> 1) & (0x55U * bmul)); \ + /* Replace each 4 bits with their sideays sum. 0x3 = 0b0011. */\ + x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \ + /* \ + * Replace each 8 bits with their sideways sum. Note that we \ + * can't overflow within each 4-bit sum here, so we can skip \ + * the initial mask. \ + */ \ + x = (x + (x >> 4)) & (bmul * 0x0FU); \ + /* \ + * None of the partial sums in this multiplication (viewed in \ + * base-256) can overflow into the next digit. So the least \ + * significant byte of the product will be the least \ + * significant byte of the original value, the second least \ + * significant byte will be the sum of the two least \ + * significant bytes of the original value, and so on. \ + * Importantly, the high byte will be the byte-wise sum of all \ + * the bytes of the original value. 
\ + */ \ + x = x * bmul; \ + x >>= ((sizeof(x) - 1) * 8); \ + return (unsigned)x; \ +} while(0) + +static inline unsigned +popcount_u_slow(unsigned bitmap) { + DO_POPCOUNT(bitmap, unsigned); +} + +static inline unsigned +popcount_lu_slow(unsigned long bitmap) { + DO_POPCOUNT(bitmap, unsigned long); +} + +static inline unsigned +popcount_llu_slow(unsigned long long bitmap) { + DO_POPCOUNT(bitmap, unsigned long long); +} + +#undef DO_POPCOUNT + +static inline unsigned +popcount_u(unsigned bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNT + return JEMALLOC_INTERNAL_POPCOUNT(bitmap); +#else + return popcount_u_slow(bitmap); +#endif +} + +static inline unsigned popcount_lu(unsigned long bitmap) { - return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); +#ifdef JEMALLOC_INTERNAL_POPCOUNTL + return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); +#else + return popcount_lu_slow(bitmap); +#endif } + +static inline unsigned +popcount_llu(unsigned long long bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNTLL + return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap); +#else + return popcount_llu_slow(bitmap); #endif +} /* * Clears first unset bit in bitmap, and returns * place of bit. bitmap *must not* be 0. */ -BIT_UTIL_INLINE size_t +static inline size_t cfs_lu(unsigned long* bitmap) { - size_t bit = ffs_lu(*bitmap) - 1; + util_assume(*bitmap != 0); + size_t bit = ffs_lu(*bitmap); *bitmap ^= ZU(1) << bit; return bit; } -BIT_UTIL_INLINE unsigned -ffs_zu(size_t bitmap) { +static inline unsigned +ffs_zu(size_t x) { #if LG_SIZEOF_PTR == LG_SIZEOF_INT - return ffs_u(bitmap); + return ffs_u(x); #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG - return ffs_lu(bitmap); + return ffs_lu(x); #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG - return ffs_llu(bitmap); + return ffs_llu(x); #else #error No implementation for size_t ffs() #endif } -BIT_UTIL_INLINE unsigned -ffs_u64(uint64_t bitmap) { +static inline unsigned +fls_zu(size_t x) { +#if LG_SIZEOF_PTR == LG_SIZEOF_INT + return fls_u(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG + return fls_lu(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG + return fls_llu(x); +#else +#error No implementation for size_t fls() +#endif +} + + +static inline unsigned +ffs_u64(uint64_t x) { #if LG_SIZEOF_LONG == 3 - return ffs_lu(bitmap); + return ffs_lu(x); #elif LG_SIZEOF_LONG_LONG == 3 - return ffs_llu(bitmap); + return ffs_llu(x); #else #error No implementation for 64-bit ffs() #endif } -BIT_UTIL_INLINE unsigned -ffs_u32(uint32_t bitmap) { +static inline unsigned +fls_u64(uint64_t x) { +#if LG_SIZEOF_LONG == 3 + return fls_lu(x); +#elif LG_SIZEOF_LONG_LONG == 3 + return fls_llu(x); +#else +#error No implementation for 64-bit fls() +#endif +} + +static inline unsigned +ffs_u32(uint32_t x) { #if LG_SIZEOF_INT == 2 - return ffs_u(bitmap); + return ffs_u(x); #else #error No implementation for 32-bit ffs() #endif - return ffs_u(bitmap); + return ffs_u(x); +} + +static inline unsigned +fls_u32(uint32_t x) { +#if LG_SIZEOF_INT == 2 + return fls_u(x); +#else +#error No implementation for 32-bit fls() +#endif + return fls_u(x); } -BIT_UTIL_INLINE uint64_t +static inline uint64_t pow2_ceil_u64(uint64_t x) { -#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) - if(unlikely(x <= 1)) { + if (unlikely(x <= 1)) { return x; } - size_t msb_on_index; -#if (defined(__amd64__) || defined(__x86_64__)) - asm ("bsrq %1, %0" - : "=r"(msb_on_index) // Outputs. - : "r"(x-1) // Inputs. 
- ); -#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) - msb_on_index = (63 ^ __builtin_clzll(x - 1)); -#endif + size_t msb_on_index = fls_u64(x - 1); + /* + * Range-check; it's on the callers to ensure that the result of this + * call won't overflow. + */ assert(msb_on_index < 63); return 1ULL << (msb_on_index + 1); -#else - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x |= x >> 32; - x++; - return x; -#endif } -BIT_UTIL_INLINE uint32_t +static inline uint32_t pow2_ceil_u32(uint32_t x) { -#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__))) - if(unlikely(x <= 1)) { - return x; + if (unlikely(x <= 1)) { + return x; } - size_t msb_on_index; -#if (defined(__i386__)) - asm ("bsr %1, %0" - : "=r"(msb_on_index) // Outputs. - : "r"(x-1) // Inputs. - ); -#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) - msb_on_index = (31 ^ __builtin_clz(x - 1)); -#endif + size_t msb_on_index = fls_u32(x - 1); + /* As above. */ assert(msb_on_index < 31); return 1U << (msb_on_index + 1); -#else - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x++; - return x; -#endif } /* Compute the smallest power of 2 that is >= x. */ -BIT_UTIL_INLINE size_t +static inline size_t pow2_ceil_zu(size_t x) { #if (LG_SIZEOF_PTR == 3) return pow2_ceil_u64(x); @@ -149,77 +388,21 @@ pow2_ceil_zu(size_t x) { #endif } -#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) -BIT_UTIL_INLINE unsigned -lg_floor(size_t x) { - size_t ret; - assert(x != 0); - - asm ("bsr %1, %0" - : "=r"(ret) // Outputs. - : "r"(x) // Inputs. - ); - assert(ret < UINT_MAX); - return (unsigned)ret; -} -#elif (defined(_MSC_VER)) -BIT_UTIL_INLINE unsigned +static inline unsigned lg_floor(size_t x) { - unsigned long ret; - - assert(x != 0); - + util_assume(x != 0); #if (LG_SIZEOF_PTR == 3) - _BitScanReverse64(&ret, x); -#elif (LG_SIZEOF_PTR == 2) - _BitScanReverse(&ret, x); + return fls_u64(x); #else -# error "Unsupported type size for lg_floor()" + return fls_u32(x); #endif - assert(ret < UINT_MAX); - return (unsigned)ret; } -#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) -BIT_UTIL_INLINE unsigned -lg_floor(size_t x) { - assert(x != 0); -#if (LG_SIZEOF_PTR == LG_SIZEOF_INT) - return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x); -#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG) - return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x); -#else -# error "Unsupported type size for lg_floor()" -#endif -} -#else -BIT_UTIL_INLINE unsigned -lg_floor(size_t x) { - assert(x != 0); - - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); -#if (LG_SIZEOF_PTR == 3) - x |= (x >> 32); -#endif - if (x == SIZE_T_MAX) { - return (8 << LG_SIZEOF_PTR) - 1; - } - x++; - return ffs_zu(x) - 2; -} -#endif - -BIT_UTIL_INLINE unsigned +static inline unsigned lg_ceil(size_t x) { return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1); } -#undef BIT_UTIL_INLINE - /* A compile-time version of lg_floor and lg_ceil. */ #define LG_FLOOR_1(x) 0 #define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1)) |
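
The new fls_llu()/fls_lu()/fls_u() return the 0-based index of the highest set bit, and the patch's comment explains why the builtin-clz path is written with xor rather than subtraction: GCC then emits a single bsr. Below is a minimal standalone sketch (not part of the patch, assuming a GCC/Clang toolchain with __builtin_clzll) that checks the xor form against a naive highest-bit loop; the helper names fls64_xor_clz and fls64_naive are illustrative only.

#include <assert.h>
#include <stdio.h>

/* Same expression as the patch's fls_llu(): 63 ^ clz gives the bit index. */
static unsigned
fls64_xor_clz(unsigned long long x) {
    assert(x != 0);
    return (unsigned)((8 * sizeof(x) - 1) ^ __builtin_clzll(x));
}

/* Reference: shift right until the top set bit falls off, counting steps. */
static unsigned
fls64_naive(unsigned long long x) {
    unsigned bit = 0;
    while (x >>= 1) {
        bit++;
    }
    return bit;
}

int
main(void) {
    unsigned long long cases[] = {1, 2, 3, 0x80, 0xdeadbeefULL,
        0x8000000000000000ULL, (unsigned long long)-1};
    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        assert(fls64_xor_clz(cases[i]) == fls64_naive(cases[i]));
    }
    printf("fls xor/clz check passed\n");
    return 0;
}

The xor works because clz of a value whose top bit is at position b is 63 - b, and 63 - b equals 63 ^ b whenever b is in [0, 63], so xoring with 63 again recovers b.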
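
The DO_POPCOUNT fallback is the classic SWAR ("sideways sum") popcount described in its comments: pair sums, nibble sums, byte sums, then a multiply whose high byte accumulates all byte sums. Here is a standalone sketch (again not part of the patch) of the same steps written out for uint64_t and checked against a per-bit loop; popcount64_swar and popcount64_naive are illustrative names.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned
popcount64_swar(uint64_t x) {
    const uint64_t bmul = 0x0101010101010101ULL;

    /* Replace each 2 bits with the sideways sum of the original bits. */
    x = x - ((x >> 1) & (0x55U * bmul));
    /* Replace each 4 bits with their sideways sum. */
    x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));
    /* Replace each 8 bits with their sideways sum; nibble sums can't overflow. */
    x = (x + (x >> 4)) & (bmul * 0x0FU);
    /* High byte of the product is the sum of all the byte sums. */
    x *= bmul;
    return (unsigned)(x >> 56);
}

static unsigned
popcount64_naive(uint64_t x) {
    unsigned n = 0;
    for (; x != 0; x >>= 1) {
        n += (unsigned)(x & 1);
    }
    return n;
}

int
main(void) {
    uint64_t cases[] = {0, 1, 0xff, 0x8000000000000001ULL,
        0xdeadbeefcafebabeULL, UINT64_MAX};
    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        assert(popcount64_swar(cases[i]) == popcount64_naive(cases[i]));
    }
    printf("popcount check passed\n");
    return 0;
}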
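
Finally, the rewritten pow2_ceil_u64() and lg_floor() reduce to the new fls helpers: for x > 1, pow2_ceil(x) is 1 << (fls(x - 1) + 1), and lg_floor(x) is simply fls(x). A small sketch under the same GCC/Clang assumption, using __builtin_clzll as a stand-in for fls_u64(); fls64 and pow2_ceil64 are illustrative names, and the range assert mirrors the patch's note that callers must avoid overflow.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned
fls64(uint64_t x) {
    assert(x != 0);
    return (unsigned)(63 ^ __builtin_clzll(x));
}

static uint64_t
pow2_ceil64(uint64_t x) {
    if (x <= 1) {
        return x;
    }
    unsigned msb_on_index = fls64(x - 1);
    /* As in the patch: the caller must ensure the result fits. */
    assert(msb_on_index < 63);
    return 1ULL << (msb_on_index + 1);
}

int
main(void) {
    assert(pow2_ceil64(1) == 1);
    assert(pow2_ceil64(3) == 4);
    assert(pow2_ceil64(4) == 4);
    assert(pow2_ceil64(4097) == 8192);
    assert(fls64(4096) == 12);    /* lg_floor(4096) == 12 */
    printf("pow2_ceil/lg_floor checks passed\n");
    return 0;
}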