 numpy/core/src/common/simd/avx2/memory.h   | 24 +-
 numpy/core/src/common/simd/avx2/misc.h     | 27 +-
 numpy/core/src/common/simd/avx512/math.h   |  2 +-
 numpy/core/src/common/simd/avx512/memory.h | 12 +-
 numpy/core/src/common/simd/avx512/misc.h   | 32 +-
 numpy/core/src/common/simd/simd.h          | 19 +
 numpy/core/src/common/simd/sse/misc.h      | 25 +-
 7 files changed, 116 insertions(+), 25 deletions(-)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index e27bf15fe..5891a270a 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 #if 0 // slower
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
 }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_maskload_epi64((const void*)ptr, mask);
 }
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride,
                                          npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i idx   = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i idx   = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
 }
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     _mm256_maskstore_epi64((void*)ptr, mask, a);
 }
diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h
index e96696dc9..5e91e91b3 100644
--- a/numpy/core/src/common/simd/avx2/misc.h
+++ b/numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
 #define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
-#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
 #define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)
 
+NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(ai, ai, ai, ai);
+#else
+    return _mm256_set1_epi64x(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a);
+#else
+    return _mm256_set1_epi64x(a);
+#endif
+}
 /*
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int
 }
 NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm256_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
+    );
+#else
     return _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
 }
 NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 0141396d0..0949b2b06 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
     return _mm512_range_pd(a, a, 8);
 #else
     return npyv_and_f64(
-        a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+        a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
     );
 #endif
 }
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index bffd6e907..47095bf72 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 //// 64
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
 //// 64
 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
 }
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride,
                                          npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
 }
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
 NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h
index 4b6729b05..c3039ecfe 100644
--- a/numpy/core/src/common/simd/avx512/misc.h
+++ b/numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
 #define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
-#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
 #define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)
 
+NPY_FINLINE __m512i npyv__setr_epi64(
+    npy_int64, npy_int64, npy_int64, npy_int64,
+    npy_int64, npy_int64, npy_int64, npy_int64
+);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
+#else
+    return _mm512_set1_epi64(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a, a, a, a, a);
+#else
+    return _mm512_set1_epi64(a);
+#endif
+}
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
 NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
                                      npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm512_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
+        (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
+        (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
+    );
+#else
     return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+#endif
 }
 NPY_FINLINE __m512 npyv__setr_ps(
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index a3e2b95de..08b2a7d00 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
 typedef float  npyv_lanetype_f32;
 typedef double npyv_lanetype_f64;
 
+#if defined(_MSC_VER) && defined(_M_IX86)
+/*
+ * Avoid using any of the following intrinsics with 32-bit MSVC,
+ * even though they appear to work on newer versions.
+ * They have a bad impact on the generated instructions:
+ * sometimes the compiler ignores 32-bit mode and emits 64-bit
+ * instructions, which leads to a crash, and other times it
+ * generates badly emulated instructions.
+ */
+    #undef _mm512_set1_epi64
+    #undef _mm256_set1_epi64x
+    #undef _mm_set1_epi64x
+    #undef _mm512_setr_epi64x
+    #undef _mm256_setr_epi64x
+    #undef _mm_setr_epi64x
+    #undef _mm512_set_epi64x
+    #undef _mm256_set_epi64x
+    #undef _mm_set_epi64x
+#endif
 #if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
     #include "avx512/avx512.h"
 #elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
index 1099c491d..7d13fbf55 100644
--- a/numpy/core/src/common/simd/sse/misc.h
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
 #define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
 #define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
 #define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
-#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
-#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
 #define npyv_setall_f32 _mm_set1_ps
 #define npyv_setall_f64 _mm_set1_pd
 
+NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);
+
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
 }
 NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
+#else
     return _mm_set_epi64x(i1, i0);
+#endif
 }
 NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
 {
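Every file above funnels 64-bit lane construction through a per-width npyv__setr_epi64 helper, which on 32-bit MSVC rebuilds the vector from the low and high 32-bit halves of each value instead of calling a 64-bit set intrinsic. The following standalone sketch demonstrates the same trick outside the NumPy headers, using SSE2 only and assuming a little-endian x86 target; the helper name set1_epi64_via_epi32 and the main() check are illustrative, not part of the patch.

/* Sketch of the patch's core trick: splat a 64-bit value using only
 * 32-bit lane intrinsics, so no 64-bit set intrinsic is needed on
 * 32-bit MSVC. */
#include <emmintrin.h> /* SSE2 */
#include <stdint.h>
#include <stdio.h>

static __m128i set1_epi64_via_epi32(int64_t a)
{
    /* _mm_setr_epi32 fills lanes in memory order, so on little-endian
     * x86 each 64-bit lane is written as (low half, high half). */
    return _mm_setr_epi32(
        (int)a, (int)(a >> 32),
        (int)a, (int)(a >> 32)
    );
}

int main(void)
{
    const int64_t val = 0x7fffffffffffffffLL; /* the abs mask from avx512/math.h */
    __m128i v = set1_epi64_via_epi32(val);

    int64_t lanes[2];
    _mm_storeu_si128((__m128i*)lanes, v);
    /* Both 64-bit lanes must hold the original value. */
    printf("%s\n", (lanes[0] == val && lanes[1] == val) ? "ok" : "mismatch");
    return 0;
}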
