summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- numpy/core/src/umath/loops_minmax.dispatch.c.src 68
1 files changed, 53 insertions, 15 deletions
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index c2eddcee6..891be445e 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -1,6 +1,8 @@
/*@targets
** $maxopt baseline
** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
**/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
@@ -20,7 +22,6 @@
#define scalar_max_i(A, B) ((A > B) ? A : B)
#define scalar_min_i(A, B) ((A < B) ? A : B)
// fp, propagates NaNs
-#if !defined(__aarch64__)
#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B)
#define scalar_max_d scalar_max_f
#define scalar_max_l scalar_max_f
@@ -34,28 +35,61 @@
#define scalar_minp_f fminf
#define scalar_minp_d fmin
#define scalar_minp_l fminl
-#elif defined(__aarch64__)
+
+// special optimization for fp scalar min/max that propagate NaNs,
+// since C99 provides no function with these semantics
+#ifndef NPY_DISABLE_OPTIMIZATION
/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #scalar_sfx = f, d, l#
- * #asm_sfx = s, d, d#
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #c_sfx = f, d#
+ * #isa_sfx = s, d#
+ * #sse_type = __m128, __m128d#
*/
/**begin repeat1
- * #op = max, min, maxp, minp#
- * #asm_instr = fmax, fmin, fmaxnm, fminnm#
+ * #op = max, min#
+ * #neon_instr = fmax, fmin#
*/
-static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
+#ifdef NPY_HAVE_SSE2
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+ @sse_type@ va = _mm_set_s@isa_sfx@(a);
+ @sse_type@ vb = _mm_set_s@isa_sfx@(b);
+ @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
+ // x86 min/max already yield the second operand (b) when an input is NaN, so only a NaN in `a` needs a fix-up: nn is set where `a` is ordered (not NaN)
+ @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
+ #ifdef NPY_HAVE_SSE41
+ rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
+ #else // no blendv available: emulate the select bitwise as va ^ ((va ^ rv) & nn)
+ rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
+ #endif
+ return _mm_cvts@isa_sfx@_@sfx@(rv);
+}
+#endif // NPY_HAVE_SSE2
+#ifdef __aarch64__
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
@type@ result = 0;
__asm(
- "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]"
+ "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
: [result] "=w" (result)
: [a] "w" (a), [b] "w" (b)
);
return result;
}
+#endif // __aarch64__
/**end repeat1**/
/**end repeat**/
-#endif // !defined(__aarch64__)
+#endif // NPY_DISABLE_OPTIMIZATION
+// map the long double variants to the double ones when both types have the same size
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
+/**begin repeat
+ * #op = max, min, maxp, minp#
+ */
+ #undef scalar_@op@_l
+ #define scalar_@op@_l scalar_@op@_d
+/**end repeat**/
+#endif
/*******************************************************************************
** extra SIMD intrinsics
@@ -92,7 +126,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
*/
NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
{
- npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+ npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
npyv_storea_@sfx@(s, v);
npyv_lanetype_@sfx@ result = s[0];
for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -127,7 +161,11 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b)
{
npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b);
+ // NaN fix-up: lanes where `b` (then `a`) is NaN are replaced by that input, so NaNs propagate into the result
+ // On x86 the min/max instruction presumably already returns `b` for NaN lanes (see Intel docs), hence the `b` select is compiled only for other targets
+ #ifndef NPY_HAVE_SSE2
result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+ #endif
result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a);
return result;
}
@@ -138,7 +176,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
*/
NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
{
- npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+ npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
npyv_storea_@sfx@(s, v);
npyv_lanetype_@sfx@ result = s[0];
for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -315,10 +353,10 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
* FLOAT, DOUBLE, LONGDOUBLE#
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * float, double, long double#
+ * npy_float, npy_double, npy_longdouble#
*
* #is_fp = 0*10, 1*3#
- * #is_unsigend = 1*5, 0*5, 0*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
* #scalar_sfx = i*10, f, d, l#
*/
#undef TO_SIMD_SFX
@@ -332,7 +370,7 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
#if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
#undef TO_SIMD_SFX
#endif
- #elif @is_unsigend@
+ #elif @is_unsigned@
#define TO_SIMD_SFX(X) X##_u@len@
#else
#define TO_SIMD_SFX(X) X##_s@len@