diff options
author | Sayed Adel <seiko@imavr.com> | 2021-11-13 03:19:18 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2022-01-19 07:54:40 +0200 |
commit | 47644b22624feda351457a6b0467e43ae5f9db62 (patch) | |
tree | 3425667fafd0a838b670e3f8f95b02a071e90078 | |
parent | 2946e025793820f33503f201d343a7a47e5cb8c2 (diff) | |
download | numpy-47644b22624feda351457a6b0467e43ae5f9db62.tar.gz |
SIMD: Add new universal intrinsics for lookup table
-rw-r--r-- | numpy/core/src/common/simd/avx2/memory.h | 21 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/memory.h | 29 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/memory.h | 40 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/memory.h | 41 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/memory.h | 37 |
5 files changed, 168 insertions(+), 0 deletions(-)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 5891a270a..410c35dc8 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -353,4 +353,25 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64) +/********************************* + * Lookup tables + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of float32. +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ return _mm256_i32gather_ps(table, idx, 4); } +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of float64. 
+NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ return _mm256_i64gather_pd(table, idx, 8); } +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } + #endif // _NPY_SIMD_AVX2_MEMORY_H diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index 47095bf72..dcfb6c890 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -329,4 +329,33 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64) +/************************************************** + * Lookup table + *************************************************/ +// uses vector as indexes into a table +// that contains 32 elements of float32. +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ + const npyv_f32 table0 = npyv_load_f32(table); + const npyv_f32 table1 = npyv_load_f32(table + 16); + return _mm512_permutex2var_ps(table0, idx, table1); +} +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of float64. 
+NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ + const npyv_f64 table0 = npyv_load_f64(table); + const npyv_f64 table1 = npyv_load_f64(table + 8); + return _mm512_permutex2var_pd(table0, idx, table1); +} +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } + #endif // _NPY_SIMD_AVX512_MEMORY_H diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index 1e258f1bc..7060ea628 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -332,5 +332,45 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64) #if NPY_SIMD_F64 NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64) #endif +/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of uint32. 
+NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ + const unsigned i0 = vgetq_lane_u32(idx, 0); + const unsigned i1 = vgetq_lane_u32(idx, 1); + const unsigned i2 = vgetq_lane_u32(idx, 2); + const unsigned i3 = vgetq_lane_u32(idx, 3); + + uint32x2_t low = vcreate_u32(table[i0]); + low = vld1_lane_u32((const uint32_t*)table + i1, low, 1); + uint32x2_t high = vcreate_u32(table[i2]); + high = vld1_lane_u32((const uint32_t*)table + i3, high, 1); + return vcombine_u32(low, high); +} +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ return npyv_reinterpret_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of uint64. +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ + const unsigned i0 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 0); + const unsigned i1 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 2); + return vcombine_u64( + vld1_u64((const uint64_t*)table + i0), + vld1_u64((const uint64_t*)table + i1) + ); +} +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ return npyv_reinterpret_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } +#endif #endif // _NPY_SIMD_NEON_MEMORY_H diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 1074c3b02..3ff64848d 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -495,4 +495,45 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64) 
+/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of float32. +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ + const int i0 = _mm_cvtsi128_si32(idx); +#ifdef NPY_HAVE_SSE41 + const int i1 = _mm_extract_epi32(idx, 1); + const int i2 = _mm_extract_epi32(idx, 2); + const int i3 = _mm_extract_epi32(idx, 3); +#else + const int i1 = _mm_extract_epi16(idx, 2); + const int i2 = _mm_extract_epi16(idx, 4); + const int i3 = _mm_extract_epi16(idx, 6); +#endif + return npyv_set_f32(table[i0], table[i1], table[i2], table[i3]); +} +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of float64. 
+NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ + const int i0 = _mm_cvtsi128_si32(idx); +#ifdef NPY_HAVE_SSE41 + const int i1 = _mm_extract_epi32(idx, 2); +#else + const int i1 = _mm_extract_epi16(idx, 4); +#endif + return npyv_set_f64(table[i0], table[i1]); +} +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } + #endif // _NPY_SIMD_SSE_MEMORY_H diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h index 08a0a9276..3007584ef 100644 --- a/numpy/core/src/common/simd/vsx/memory.h +++ b/numpy/core/src/common/simd/vsx/memory.h @@ -343,4 +343,41 @@ NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64) +/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of float32. +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ + const unsigned i0 = vec_extract(idx, 0); + const unsigned i1 = vec_extract(idx, 1); + const unsigned i2 = vec_extract(idx, 2); + const unsigned i3 = vec_extract(idx, 3); + npyv_f32 r = vec_promote(table[i0], 0); + r = vec_insert(table[i1], r, 1); + r = vec_insert(table[i2], r, 2); + r = vec_insert(table[i3], r, 3); + return r; +} +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of float64. 
+NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ + const unsigned i0 = vec_extract((npyv_u32)idx, 0); + const unsigned i1 = vec_extract((npyv_u32)idx, 2); + npyv_f64 r = vec_promote(table[i0], 0); + r = vec_insert(table[i1], r, 1); + return r; +} +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } + #endif // _NPY_SIMD_VSX_MEMORY_H |