-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h        2
-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h    286
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h    3
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h  238
-rw-r--r--  numpy/core/src/common/simd/neon/memory.h    287
-rw-r--r--  numpy/core/src/common/simd/simd.h            49
-rw-r--r--  numpy/core/src/common/simd/sse/memory.h     424
-rw-r--r--  numpy/core/src/common/simd/vsx/memory.h     432
8 files changed, 1603 insertions, 118 deletions
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index c99d628ee..0641f2314 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -5,6 +5,8 @@
#define NPY_SIMD 256
#define NPY_SIMD_WIDTH 32
#define NPY_SIMD_F64 1
+// Maximum stride that still allows us to use _mm256_i32gather_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
typedef __m256i npyv_u8;
typedef __m256i npyv_s8;
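
The NPY_SIMD_MAXLOAD_STRIDE32 bound above exists because the 8-lane gather builds its indices as signed 32-bit lane values (stride*0 .. stride*7), so the largest index must remain representable. A minimal check of that arithmetic (illustrative only; check_gather_stride32 is a hypothetical helper, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* The AVX2 gather computes per-lane indices stride*i for i = 0..7 in
     * 32-bit signed lanes, so |stride| must not exceed 0x7fffffff / 8. */
    static void check_gather_stride32(int64_t stride)
    {
        assert(stride >= -(int64_t)(0x7fffffff / 8));
        assert(stride <=  (int64_t)(0x7fffffff / 8));
        int64_t largest = stride * 7;            /* index of the last lane */
        assert(largest <= INT32_MAX && largest >= INT32_MIN);
    }
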
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 5ea7414fd..e27bf15fe 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -2,6 +2,8 @@
#error "Not a standalone header"
#endif
+#include "misc.h"
+
#ifndef _NPY_SIMD_AVX2_MEMORY_H
#define _NPY_SIMD_AVX2_MEMORY_H
@@ -66,5 +68,289 @@ NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1))
#define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ return _mm256_i32gather_epi32((const int*)ptr, idx, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+#if 0 // slower
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+#endif
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, ptr + stride);
+ __m128d a23 = _mm_loadh_pd(a2, ptr + stride*3);
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ ptr[stride * 0] = _mm_cvtsi128_si32(a0);
+ ptr[stride * 1] = _mm_extract_epi32(a0, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a0, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a0, 3);
+ ptr[stride * 4] = _mm_cvtsi128_si32(a1);
+ ptr[stride * 5] = _mm_extract_epi32(a1, 1);
+ ptr[stride * 6] = _mm_extract_epi32(a1, 2);
+ ptr[stride * 7] = _mm_extract_epi32(a1, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm256_castps_si256(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(a);
+ __m128d a1 = _mm256_extractf128_pd(a, 1);
+ _mm_storel_pd(ptr + stride * 0, a0);
+ _mm_storeh_pd(ptr + stride * 1, a0);
+ _mm_storel_pd(ptr + stride * 2, a1);
+ _mm_storeh_pd(ptr + stride * 3, a1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_maskload_epi32((const int*)ptr, mask);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_maskload_epi64((const void*)ptr, mask);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ _mm256_maskstore_epi32((int*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ _mm256_maskstore_epi64((void*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ switch(nlane) {
+ default:
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
+ case 7:
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ case 6:
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 5:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ case 4:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ case 1:
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ double *dptr = (double*)ptr;
+ switch(nlane) {
+ default:
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ case 3:
+ _mm_storel_pd(dptr + stride * 2, a1);
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ case 1:
+ _mm_storel_pd(dptr + stride * 0, a0);
+ }
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX2_MEMORY_H
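
The AVX2 partial loads above build their lane mask by comparing a broadcast lane count against the step vector {0,...,7}, so lane i is read from memory only while i < nlane and takes the fill value otherwise. A scalar sketch of that semantics (illustrative only; ref_load_till_s32 is a hypothetical name, not part of the patch):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of npyv_load_till_s32 for the 8-lane AVX2 case. */
    static void ref_load_till_s32(int32_t dst[8], const int32_t *ptr,
                                  size_t nlane, int32_t fill)
    {
        assert(nlane > 0);
        for (size_t i = 0; i < 8; ++i) {
            /* mirrors _mm256_cmpgt_epi32(vnlane, steps) */
            dst[i] = (i < nlane) ? ptr[i] : fill;
        }
    }
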
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 96fdf72b9..b09d772f2 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -4,6 +4,9 @@
#define NPY_SIMD 512
#define NPY_SIMD_WIDTH 64
#define NPY_SIMD_F64 1
+// Maximum stride that still allows us to use _mm512_i32gather_* and _mm512_i32scatter_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
typedef __m512i npyv_u8;
typedef __m512i npyv_s8;
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index e212c4555..bffd6e907 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -90,5 +90,243 @@ NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC))
#define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ return _mm512_i32gather_epi32(idx, (const __m512i*)ptr, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ return _mm512_i64gather_epi64(idx, (const __m512i*)ptr, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ _mm512_i32scatter_epi32((__m512i*)ptr, idx, a, 4);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ _mm512_i64scatter_epi64((__m512i*)ptr, idx, a, 8);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi32(fill);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi64(fill);
+    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ const __m512i vfill = _mm512_set1_epi32(fill);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set1_epi64(fill);
+    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX512_MEMORY_H
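
On AVX-512 the same partial operations are driven by a bitmask instead of a vector compare: the lowest bits of a __mmask16/__mmask8 are set, one per active lane. The patch caps nlane before shifting and relies on truncation to the mask width; the sketch below expresses the intended result by capping at the lane count directly (illustrative only; partial_mask16 is a hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* Build a 16-lane partial mask: the lowest min(nlane, 16) bits are set.
     * Capping before the shift avoids shifting a 32-bit value by >= 32 bits. */
    static uint16_t partial_mask16(uint64_t nlane)
    {
        return nlane >= 16 ? (uint16_t)0xFFFF : (uint16_t)((1u << nlane) - 1);
    }
    /* e.g. partial_mask16(3) == 0x0007, partial_mask16(20) == 0xFFFF */
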
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index afa703584..1e258f1bc 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_NEON_MEMORY_H
#define _NPY_SIMD_NEON_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -45,5 +47,290 @@ NPYV_IMPL_NEON_MEM(f32, float)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_MEM(f64, double)
#endif
+/***************************
+ * Non-contiguous Load
+ ***************************/
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ switch (stride) {
+ case 2:
+ return vld2q_s32((const int32_t*)ptr).val[0];
+ case 3:
+ return vld3q_s32((const int32_t*)ptr).val[0];
+ case 4:
+ return vld4q_s32((const int32_t*)ptr).val[0];
+ default:;
+ int32x2_t ax = vcreate_s32(*ptr);
+ int32x4_t a = vcombine_s32(ax, ax);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3);
+ return a;
+ }
+}
+
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{
+ return vcombine_s64(
+ vld1_s64((const int64_t*)ptr), vld1_s64((const int64_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#endif
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ vst1q_lane_s64((int64_t*)ptr + stride, a, 1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); }
+
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
+#endif
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0);
+ case 2:
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill));
+ case 3:
+ return vcombine_s32(
+ vld1_s32((const int32_t*)ptr),
+ vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0)
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
+
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ int32x4_t vfill = vdupq_n_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2);
+ case 2:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1);
+ case 1:
+ vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0);
+ return vfill;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ case 2:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ break;
+ case 3:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ vst1q_lane_s32((int32_t*)ptr + 2, a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ default:
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
+#endif
#endif // _NPY_SIMD_NEON_MEMORY_H
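
NEON has no gather instruction, so npyv_loadn_s32 above special-cases strides 2, 3 and 4 with the de-interleaving vld2q/vld3q/vld4q loads and otherwise inserts lane by lane. Either way the intended result matches this scalar sketch (illustrative only; ref_loadn_s32 is a hypothetical name, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the 4-lane strided load: lane i reads ptr[stride * i]. */
    static void ref_loadn_s32(int32_t dst[4], const int32_t *ptr, ptrdiff_t stride)
    {
        for (int i = 0; i < 4; ++i) {
            dst[i] = ptr[stride * i];
        }
    }
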
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 2f39c8427..8804223c9 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -49,6 +49,55 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_WIDTH 0
#define NPY_SIMD_F64 0
#endif
+/**
+ * Some SIMD extensions (currently AVX2 and AVX512F) impose a de facto
+ * limit on the maximum stride size when dealing with non-contiguous memory access.
+ *
+ * Therefore, the following functions must be used to check that a stride is within
+ * the acceptable limit before using any of the non-contiguous load/store intrinsics.
+ *
+ * For instance:
+ * npy_intp ld_stride = step[0] / sizeof(float);
+ * npy_intp st_stride = step[1] / sizeof(float);
+ *
+ * if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
+ * for (;;)
+ * npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
+ * // ...
+ * npyv_storen_f32(st_pointer, st_stride, a);
+ * }
+ * else {
+ * for (;;)
+ * // C scalars
+ * }
+ */
+#ifndef NPY_SIMD_MAXLOAD_STRIDE32
+ #define NPY_SIMD_MAXLOAD_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE32
+ #define NPY_SIMD_MAXSTORE_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXLOAD_STRIDE64
+ #define NPY_SIMD_MAXLOAD_STRIDE64 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE64
+ #define NPY_SIMD_MAXSTORE_STRIDE64 0
+#endif
+#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
+ NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \
+ { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \
+ NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \
+ { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
+#if NPY_SIMD
+ NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+ NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
+#if NPY_SIMD_F64
+ NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
#ifdef __cplusplus
}
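
The stride guards added to simd.h are meant to be checked once per loop before entering the SIMD path. A slightly fuller version of the usage sketch from the comment above (hypothetical kernel; the name sketch_strided_copy_f32 and the use of npyv_nlanes_f32 as the lane count are assumptions, not part of the patch):

    /* Sketch only: copy `len` floats from a strided source to a strided
     * destination, falling back to scalars when a stride exceeds the limit. */
    static void sketch_strided_copy_f32(float *dst, npy_intp st_stride,
                                        const float *src, npy_intp ld_stride,
                                        npy_intp len)
    {
    #if NPY_SIMD
        if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
            const npy_intp vstep = npyv_nlanes_f32;  /* lanes per vector (assumed macro) */
            for (; len >= vstep; len -= vstep,
                   src += ld_stride * vstep, dst += st_stride * vstep) {
                npyv_f32 a = npyv_loadn_f32(src, ld_stride);
                npyv_storen_f32(dst, st_stride, a);
            }
        }
    #endif
        for (; len > 0; --len, src += ld_stride, dst += st_stride) {
            *dst = *src;  /* scalar tail, and fallback for oversized strides */
        }
    }
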
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 1a555d6f0..1074c3b02 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_SSE_MEMORY_H
#define _NPY_SIMD_SSE_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -70,5 +72,427 @@ NPYV_IMPL_SSE_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ __m128i a = _mm_cvtsi32_si128(*ptr);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ a = _mm_insert_epi32(a, ptr[stride*3], 3);
+#else
+ __m128i a1 = _mm_cvtsi32_si128(ptr[stride]);
+ __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]);
+ __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]);
+ a = _mm_unpacklo_epi32(a, a1);
+ a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3));
+#endif
+ return a;
+}
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return npyv_loadn_s32((const npy_int32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); }
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ ptr[stride * 0] = _mm_cvtsi128_si32(a);
+#ifdef NPY_HAVE_SSE41
+ ptr[stride * 1] = _mm_extract_epi32(a, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a, 3);
+#else
+ ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+#endif
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ _mm_storel_pd(ptr, a);
+ _mm_storeh_pd(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+#if defined(__clang__) && __clang_major__ > 7
+ /**
+     * Clang >= 8 performs aggressive optimizations that tend to
+     * zero the bits of the upper half of vectors even
+     * when we try to fill them with certain scalars,
+     * which may lead to division-by-zero errors.
+ */
+ #define NPYV__CLANG_ZEROUPPER
+#endif
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_load_s32(ptr);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s32(data);
+#else
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ const __m128i vfill = npyv_setall_s32(fill);
+ __m128i a;
+ switch(nlane) {
+ case 2:
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ #ifdef NPY_HAVE_SSE41
+ case 1:
+ return _mm_insert_epi32(vfill, ptr[0], 0);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_insert_epi32(a, ptr[2], 2);
+ a = _mm_insert_epi32(a, fill, 3);
+ return a;
+ #else
+ case 1:
+ a = _mm_insert_epi16(vfill, wptr[0], 0);
+ return _mm_insert_epi16(a, wptr[1], 1);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_unpacklo_epi64(a, vfill);
+ a = _mm_insert_epi16(a, wptr[4], 4);
+ a = _mm_insert_epi16(a, wptr[5], 5);
+ return a;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_load_s32(ptr);
+ }
+#endif
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(*ptr);
+ case 2:
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ case 3:;
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_load_s64(ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_loadn_s32(ptr, stride);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[stride*i];
+ }
+ return npyv_loada_s32(data);
+#else
+ __m128i vfill = npyv_setall_s32(fill);
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ switch(nlane) {
+ #ifdef NPY_HAVE_SSE41
+ case 3:
+ vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2);
+ case 2:
+ vfill = _mm_insert_epi32(vfill, ptr[stride], 1);
+ case 1:
+ vfill = _mm_insert_epi32(vfill, ptr[0], 0);
+ break;
+ #else
+ case 3:
+ vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill);
+ case 2:
+ vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride])
+ ), vfill);
+ break;
+ case 1:
+ vfill = _mm_insert_epi16(vfill, wptr[0], 0);
+ vfill = _mm_insert_epi16(vfill, wptr[1], 1);
+ break;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+#endif
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(ptr[0]);
+ case 2:;
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+#else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+#endif // NPY_HAVE_SSE41
+ case 3:;
+ a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+#else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+#endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i*stride];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = _mm_cvtsi128_si32(a);
+ break;
+ case 2:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ break;
+ case 3:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ #ifdef NPY_HAVE_SSE41
+ ptr[2] = _mm_extract_epi32(a, 2);
+ #else
+ ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ #endif
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+#ifdef NPY_HAVE_SSE41
+ default:
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+#else
+ default:
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 3:
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+#endif
+ case 1:
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_SSE_MEMORY_H
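
NPYV_IMPL_SSE_REST_PARTIAL_TYPES (like its AVX2/AVX512/NEON counterparts) forwards the unsigned and float variants to the signed implementations, passing the fill value through a union so its bit pattern is preserved rather than converted. Roughly what the f32 load expands to (illustrative expansion; formatting differs from the actual preprocessor output):

    NPY_FINLINE npyv_f32 npyv_load_till_f32(const float *ptr, npy_uintp nlane, float fill)
    {
        /* type-pun the fill value so its bit pattern is reused as npy_int32 */
        union { float from_f32; npy_int32 to_s32; } pun = {.from_f32 = fill};
        return npyv_reinterpret_f32_s32(npyv_load_till_s32(
            (const npy_int32*)ptr, nlane, pun.to_s32
        ));
    }
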
diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h
index e0d908bf9..08a0a9276 100644
--- a/numpy/core/src/common/simd/vsx/memory.h
+++ b/numpy/core/src/common/simd/vsx/memory.h
@@ -4,147 +4,343 @@
#ifndef _NPY_SIMD_VSX_MEMORY_H
#define _NPY_SIMD_VSX_MEMORY_H
+
+#include "misc.h"
+
/****************************
- * load/store
+ * Private utilities
****************************/
// TODO: test load by cast
#define VSX__CAST_lOAD 0
#if VSX__CAST_lOAD
- #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
+ #define npyv__load(T_VEC, PTR) (*((T_VEC*)(PTR)))
#else
/**
* Clang fails to load unaligned addresses via vec_xl, vec_xst
* so we fall back to vec_vsx_ld, vec_vsx_st
*/
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_vsx_ld(0, PTR)
#else
- #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_xl(0, PTR)
#endif
#endif
-// unaligned load
-#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8)
-#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8)
-#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
-#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
-#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
-#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
-#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
-#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
-#if VSX__CAST_lOAD
- #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
- #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
+// unaligned store
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
#else
- #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
- #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR))
+ #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR)
#endif
-// aligned load
-#define npyv_loada_u8(PTR) vec_ld(0, PTR)
-#define npyv_loada_s8 npyv_loada_u8
-#define npyv_loada_u16 npyv_loada_u8
-#define npyv_loada_s16 npyv_loada_u8
-#define npyv_loada_u32 npyv_loada_u8
-#define npyv_loada_s32 npyv_loada_u8
-#define npyv_loada_u64 npyv_load_u64
-#define npyv_loada_s64 npyv_load_s64
-#define npyv_loada_f32 npyv_loada_u8
-#define npyv_loada_f64 npyv_load_f64
-// stream load
-#define npyv_loads_u8 npyv_loada_u8
-#define npyv_loads_s8 npyv_loada_s8
-#define npyv_loads_u16 npyv_loada_u16
-#define npyv_loads_s16 npyv_loada_s16
-#define npyv_loads_u32 npyv_loada_u32
-#define npyv_loads_s32 npyv_loada_s32
-#define npyv_loads_u64 npyv_loada_u64
-#define npyv_loads_s64 npyv_loada_s64
-#define npyv_loads_f32 npyv_loada_f32
-#define npyv_loads_f64 npyv_loada_f64
-// load lower part
+
// avoid aliasing rules
#ifdef __cplusplus
template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
- { return npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
- { npy_uint64 *ptr64 = ptr; return ptr64; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#endif // __cplusplus
-#if defined(__clang__) && !defined(__IBMC__)
- // vec_promote doesn't support doubleword on clang
- #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
-#else
- #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
-#endif
-#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR))
-#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR))
-#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
-#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
-#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
-#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
-#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
-// unaligned store
-#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
-#else
- #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
-#endif
-#define npyv_store_s8 npyv_store_u8
-#define npyv_store_u16 npyv_store_u8
-#define npyv_store_s16 npyv_store_u8
-#define npyv_store_u32 npyv_store_u8
-#define npyv_store_s32 npyv_store_u8
-#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_f32 npyv_store_u8
-#define npyv_store_f64 npyv_store_u8
-// aligned store
-#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
-#define npyv_storea_s8 npyv_storea_u8
-#define npyv_storea_u16 npyv_storea_u8
-#define npyv_storea_s16 npyv_storea_u8
-#define npyv_storea_u32 npyv_storea_u8
-#define npyv_storea_s32 npyv_storea_u8
-#define npyv_storea_u64 npyv_store_u64
-#define npyv_storea_s64 npyv_store_s64
-#define npyv_storea_f32 npyv_storea_u8
-#define npyv_storea_f64 npyv_store_f64
-// stream store
-#define npyv_stores_u8 npyv_storea_u8
-#define npyv_stores_s8 npyv_storea_s8
-#define npyv_stores_u16 npyv_storea_u16
-#define npyv_stores_s16 npyv_storea_s16
-#define npyv_stores_u32 npyv_storea_u32
-#define npyv_stores_s32 npyv_storea_s32
-#define npyv_stores_u64 npyv_storea_u64
-#define npyv_stores_s64 npyv_storea_s64
-#define npyv_stores_f32 npyv_storea_f32
-#define npyv_stores_f64 npyv_storea_f64
+
+// load lower part
+NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
+{
+ #if defined(__clang__) && !defined(__IBMC__)
+ // vec_promote doesn't support doubleword on clang
+ return npyv_setall_u64(*npyv__ptr2u64(ptr));
+ #else
+ return vec_promote(*npyv__ptr2u64(ptr), 0);
+ #endif
+}
// store lower part
-#define npyv_storel_u8(PTR, VEC) \
+#define npyv__storel(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
-#define npyv_storel_s8 npyv_storel_u8
-#define npyv_storel_u16 npyv_storel_u8
-#define npyv_storel_s16 npyv_storel_u8
-#define npyv_storel_u32 npyv_storel_u8
-#define npyv_storel_s32 npyv_storel_u8
-#define npyv_storel_s64 npyv_storel_u8
-#define npyv_storel_u64 npyv_storel_u8
-#define npyv_storel_f32 npyv_storel_u8
-#define npyv_storel_f64 npyv_storel_u8
-// store higher part
-#define npyv_storeh_u8(PTR, VEC) \
+
+#define npyv__storeh(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
-#define npyv_storeh_s8 npyv_storeh_u8
-#define npyv_storeh_u16 npyv_storeh_u8
-#define npyv_storeh_s16 npyv_storeh_u8
-#define npyv_storeh_u32 npyv_storeh_u8
-#define npyv_storeh_s32 npyv_storeh_u8
-#define npyv_storeh_s64 npyv_storeh_u8
-#define npyv_storeh_u64 npyv_storeh_u8
-#define npyv_storeh_f32 npyv_storeh_u8
-#define npyv_storeh_f64 npyv_storeh_u8
+
+/****************************
+ * load/store
+ ****************************/
+#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return npyv_loada_##SFX(ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__loadl(ptr); } \
+ NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv_storea_##SFX(ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storel(ptr, vec); } \
+ NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storeh(ptr, vec); }
+
+NPYV_IMPL_VSX_MEM(u8, u8)
+NPYV_IMPL_VSX_MEM(s8, s8)
+NPYV_IMPL_VSX_MEM(u16, u16)
+NPYV_IMPL_VSX_MEM(s16, s16)
+NPYV_IMPL_VSX_MEM(u32, u32)
+NPYV_IMPL_VSX_MEM(s32, s32)
+NPYV_IMPL_VSX_MEM(u64, f64)
+NPYV_IMPL_VSX_MEM(s64, f64)
+NPYV_IMPL_VSX_MEM(f32, f32)
+NPYV_IMPL_VSX_MEM(f64, f64)
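+
+// A minimal usage sketch of the contiguous variants generated above; the
+// buffers and npyv_add_f32/npyv_setall_f32 (from the other vsx headers) are
+// assumed, not defined here:
+//     npyv_f32 v = npyv_load_f32(src);                  // src may be unaligned
+//     npyv_storea_f32(dst, npyv_add_f32(v, npyv_setall_f32(1.0f))); // dst 16-byte aligned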
+
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_set_u32(
+ ptr[stride * 0], ptr[stride * 1],
+ ptr[stride * 2], ptr[stride * 3]
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_set_u64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_set_s64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return npyv_set_f64(ptr[0], ptr[stride]); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+ ptr[stride * 2] = vec_extract(a, 2);
+ ptr[stride * 3] = vec_extract(a, 3);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
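+
+// Note: the stride is a lane count, not a byte offset. As a sketch (with
+// hypothetical `mat`, `out`, `ncols`, `c`), gathering and scattering one
+// column of a row-major float matrix with `ncols` floats per row:
+//     npyv_f32 col = npyv_loadn_f32(&mat[c], ncols);    // rows 0..3 of column c
+//     npyv_storen_f32(&out[c], ncols, col);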
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 1:
+ return vec_insert(ptr[0], vfill, 0);
+ case 2:
+ // lanes 0-1 are loaded as a single 64-bit word; lanes 2-3 keep the fill
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ case 3:
+ // lane 2 is loaded scalar-wise, then lanes 0-1 as a single 64-bit word
+ vfill = vec_insert(ptr[2], vfill, 2);
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(ptr[0], fill);
+ }
+ return npyv_load_s64(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
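+// Sketch: loading a tail whose length is not a multiple of the vector width
+// (4 s32 lanes on VSX); `src`, `len` and `left` (assumed in 1..4) are hypothetical:
+//     npyv_s32 tail = npyv_load_tillz_s32(src + len - left, left);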
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vec_insert(ptr[stride*2], vfill, 2);
+ // fallthrough
+ case 2:
+ vfill = vec_insert(ptr[stride], vfill, 1);
+ // fallthrough
+ case 1:
+ vfill = vec_insert(*ptr, vfill, 0);
+ break;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(*ptr, fill);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
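+// Sketch: the strided analogue of the partial loads above, reading the last
+// `left` (assumed 1..4) elements of a column spaced `stride` s32 lanes apart:
+//     npyv_s32 t = npyv_loadn_tillz_s32(col, stride, left);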
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = vec_extract(a, 0);
+ break;
+ case 2:
+ npyv_storel_s32(ptr, a);
+ break;
+ case 3:
+ npyv_storel_s32(ptr, a);
+ ptr[2] = vec_extract(a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
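+// Sketch: a scalar-free remainder step for an element-wise s32 copy, with a
+// hypothetical `left` in 1..4 lanes remaining:
+//     npyv_store_till_s32(dst, left, npyv_load_tillz_s32(src, left));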
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ // stores fall through from the highest requested lane down to lane 0
+ default:
+ ptr[stride*3] = vec_extract(a, 3);
+ // fallthrough
+ case 3:
+ ptr[stride*2] = vec_extract(a, 2);
+ // fallthrough
+ case 2:
+ ptr[stride*1] = vec_extract(a, 1);
+ // fallthrough
+ case 1:
+ ptr[stride*0] = vec_extract(a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
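+// Sketch: scattering a partial vector back with the same strided geometry as
+// the loadn_tillz example above (`col`, `stride`, `left`, `t` hypothetical):
+//     npyv_storen_till_s32(col, stride, left, t);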
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
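+
+// Sketch: the float variants generated above just reinterpret through the
+// signed-integer versions, so a 3-element f32 tail padded with 1.0f reads as:
+//     npyv_f32 tail = npyv_load_till_f32(fptr, 3, 1.0f);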
#endif // _NPY_SIMD_VSX_MEMORY_H