-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h   | 24
-rw-r--r--  numpy/core/src/common/simd/avx2/misc.h     | 27
-rw-r--r--  numpy/core/src/common/simd/avx512/math.h   |  2
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h | 12
-rw-r--r--  numpy/core/src/common/simd/avx512/misc.h   | 32
-rw-r--r--  numpy/core/src/common/simd/simd.h          | 19
-rw-r--r--  numpy/core/src/common/simd/sse/misc.h      | 25
7 files changed, 116 insertions(+), 25 deletions(-)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index e27bf15fe..5891a270a 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
#if 0 // slower
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
- const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
}
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
- const __m256i vfill = _mm256_set1_epi64x(fill);
- const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
- __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ const __m256i vfill = npyv_setall_s64(fill);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
- __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const void*)ptr, mask);
}
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
- const __m256i vfill = _mm256_set1_epi64x(fill);
- const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
- const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
- __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ const __m256i vfill = npyv_setall_s64(fill);
+ const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
}
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
- const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
- __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
_mm256_maskstore_epi64((void*)ptr, mask, a);
}
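
The tail-handling pattern these hunks touch works as follows: `steps` holds the lane indices 0..3 and `vnlane` broadcasts the element count, so `_mm256_cmpgt_epi64(vnlane, steps)` yields an all-ones 64-bit lane exactly where index < nlane. As a rough standalone illustration (not part of the patch; it uses the plain intrinsics the patch avoids, which are safe on a 64-bit compiler):

#include <immintrin.h>
#include <stdio.h>

/* hypothetical standalone version of npyv_load_till_s64: load `nlane`
 * elements and fill the remaining lanes with `fill` */
static __m256i load_till_s64(const long long *ptr, size_t nlane, long long fill)
{
    const __m256i vfill = _mm256_set1_epi64x(fill);
    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (long long)nlane);
    __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps); /* lane i on iff i < nlane */
    __m256i payload = _mm256_maskload_epi64(ptr, mask);  /* masked lanes are not read */
    return _mm256_blendv_epi8(vfill, payload, mask);     /* fill the masked lanes */
}

int main(void)
{
    long long src[4] = {10, 20, 30, 40}, dst[4];
    _mm256_storeu_si256((__m256i*)dst, load_till_s64(src, 2, -1));
    printf("%lld %lld %lld %lld\n", dst[0], dst[1], dst[2], dst[3]); /* 10 20 -1 -1 */
    return 0;
}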
diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h
index e96696dc9..5e91e91b3 100644
--- a/numpy/core/src/common/simd/avx2/misc.h
+++ b/numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
-#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)
+NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+ npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64(ai, ai, ai, ai);
+#else
+ return _mm256_set1_epi64x(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64(a, a, a, a);
+#else
+ return _mm256_set1_epi64x(a);
+#endif
+}
/*
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int
}
NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return _mm256_setr_epi32(
+ (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+ (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
+ );
+#else
return _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
}
NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
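
The 32-bit fallback added here relies on x86 being little-endian: within each 64-bit lane the low 32 bits come first in memory, so emitting `(int)v, (int)(v >> 32)` pairs through `_mm256_setr_epi32` reproduces `_mm256_setr_epi64x` exactly. (Note also that `npyv__setr_epi64` is forward-declared above `npyv_setall_u64` because its definition only appears further down the header.) A minimal sketch of the equivalence, with hypothetical helper names, checkable on any AVX machine:

#include <immintrin.h>
#include <string.h>
#include <assert.h>

/* the patch's 32-bit-safe construction, reduced to a standalone helper */
static __m256i setr_epi64_via_epi32(long long i0, long long i1,
                                    long long i2, long long i3)
{
    return _mm256_setr_epi32(
        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32));
}

int main(void)
{
    long long ref[4] = {1, -2, 0x1234567890abcdefLL, -4}, got[4];
    _mm256_storeu_si256((__m256i*)got,
                        setr_epi64_via_epi32(ref[0], ref[1], ref[2], ref[3]));
    assert(memcmp(ref, got, sizeof ref) == 0); /* lanes match the 64-bit inputs */
    return 0;
}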
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 0141396d0..0949b2b06 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
return _mm512_range_pd(a, a, 8);
#else
return npyv_and_f64(
- a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+ a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
);
#endif
}
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index bffd6e907..47095bf72 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
//// 64
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{
- const __m512i idx = _mm512_setr_epi64(
+ const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
//// 64
NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
{
- const __m512i idx = _mm512_setr_epi64(
+ const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
- const __m512i vfill = _mm512_set1_epi64(fill);
+ const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
- const __m512i idx = _mm512_setr_epi64(
+ const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
- const __m512i vfill = _mm512_set1_epi64(fill);
+ const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
- const __m512i idx = _mm512_setr_epi64(
+ const __m512i idx = npyv_set_s64(
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
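
On AVX-512 the tail handling needs no vector compare at all: the mask is a plain bitmask, one bit per 64-bit lane in a `__mmask8`. The `nlane > 31` guard above only keeps the shift well-defined; any count of 8 or more already truncates to all eight bits set. A cleaned-up model of the same computation (a sketch, not the patch's exact expression):

#include <immintrin.h>
#include <stddef.h>

/* bits 0..nlane-1 set, one bit per 64-bit lane of a 512-bit vector */
static __mmask8 tail_mask_s64(size_t nlane)
{
    return nlane >= 8 ? (__mmask8)0xFF : (__mmask8)((1u << nlane) - 1);
}
/* e.g. tail_mask_s64(3) == 0x07: load/store lanes 0, 1, 2 only */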
diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h
index 4b6729b05..c3039ecfe 100644
--- a/numpy/core/src/common/simd/avx512/misc.h
+++ b/numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
-#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)
+NPY_FINLINE __m512i npyv__setr_epi64(
+ npy_int64, npy_int64, npy_int64, npy_int64,
+ npy_int64, npy_int64, npy_int64, npy_int64
+);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+ npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
+#else
+ return _mm512_set1_epi64(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64(a, a, a, a, a, a, a, a);
+#else
+ return _mm512_set1_epi64(a);
+#endif
+}
/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return _mm512_setr_epi32(
+ (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+ (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
+ (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
+ (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
+ );
+#else
return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+#endif
}
NPY_FINLINE __m512 npyv__setr_ps(
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index a3e2b95de..08b2a7d00 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64;
typedef float npyv_lanetype_f32;
typedef double npyv_lanetype_f64;
+#if defined(_MSC_VER) && defined(_M_IX86)
+/*
+ * Avoid using any of the following intrinsics with MSVC 32-bit,
+ * even though they appear to work on newer versions.
+ * They have a bad impact on the generated instructions:
+ * sometimes the compiler handles them without respecting
+ * 32-bit mode, which leads to crashes from executing 64-bit
+ * instructions, and other times it generates bad emulated code.
+ */
+ #undef _mm512_set1_epi64
+ #undef _mm256_set1_epi64x
+ #undef _mm_set1_epi64x
+ #undef _mm512_setr_epi64x
+ #undef _mm256_setr_epi64x
+ #undef _mm_setr_epi64x
+ #undef _mm512_set_epi64x
+ #undef _mm256_set_epi64x
+ #undef _mm_set_epi64x
+#endif
#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
#include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
index 1099c491d..7d13fbf55 100644
--- a/numpy/core/src/common/simd/sse/misc.h
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
#define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
#define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
#define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
-#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
-#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
#define npyv_setall_f32 _mm_set1_ps
#define npyv_setall_f64 _mm_set1_pd
+NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);
+
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
+#else
+ return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return npyv__setr_epi64(a, a);
+#else
+ return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+
/**
* vector with specific values set to each lane and
* set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
}
NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
{
+#if defined(_MSC_VER) && defined(_M_IX86)
+ return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
+#else
return _mm_set_epi64x(i1, i0);
+#endif
}
NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
{
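
One detail worth noting in this last hunk: `_mm_setr_*` takes lanes in low-to-high order while `_mm_set_*` takes them high-to-low, which is why the 64-bit path reads `_mm_set_epi64x(i1, i0)` yet both branches put `i0` in the low lane. A short SSE2-only sketch (hypothetical helper name) showing the construction:

#include <emmintrin.h>
#include <stdio.h>

static __m128i setr_epi64_via_epi32(long long i0, long long i1)
{
    /* low 32-bit half of each value first: little-endian lane layout */
    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
}

int main(void)
{
    long long out[2];
    _mm_storeu_si128((__m128i*)out, setr_epi64_via_epi32(111, 222));
    printf("%lld %lld\n", out[0], out[1]); /* 111 222 */
    return 0;
}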