diff options
| -rw-r--r-- | numpy/core/src/umath/loops_minmax.dispatch.c.src | 68 |
1 files changed, 53 insertions, 15 deletions
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index c2eddcee6..891be445e 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,8 @@ /*@targets ** $maxopt baseline ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -20,7 +22,6 @@ #define scalar_max_i(A, B) ((A > B) ? A : B) #define scalar_min_i(A, B) ((A < B) ? A : B) // fp, propagates NaNs -#if !defined(__aarch64__) #define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) #define scalar_max_d scalar_max_f #define scalar_max_l scalar_max_f @@ -34,28 +35,61 @@ #define scalar_minp_f fminf #define scalar_minp_d fmin #define scalar_minp_l fminl -#elif defined(__aarch64__) + +// special optimization for fp scalars propagates NaNs +// since there's no C99 support for it +#ifndef NPY_DISABLE_OPTIMIZATION /**begin repeat - * #type = npy_float, npy_double, npy_longdouble# - * #scalar_sfx = f, d, l# - * #asm_sfx = s, d, d# + * #type = npy_float, npy_double# + * #sfx = f32, f64# + * #c_sfx = f, d# + * #isa_sfx = s, d# + * #sse_type = __m128, __m128d# */ /**begin repeat1 - * #op = max, min, maxp, minp# - * #asm_instr = fmax, fmin, fmaxnm, fminnm# + * #op = max, min# + * #neon_instr = fmax, fmin# */ -static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ +#ifdef NPY_HAVE_SSE2 +#undef scalar_@op@_@c_sfx@ +NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) { + @sse_type@ va = _mm_set_s@isa_sfx@(a); + @sse_type@ vb = _mm_set_s@isa_sfx@(b); + @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb); + // X86 handles the second operand + @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va); + #ifdef NPY_HAVE_SSE41 + rv = _mm_blendv_p@isa_sfx@(va, rv, nn); + #else + rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn)); + #endif + return _mm_cvts@isa_sfx@_@sfx@(rv); +} +#endif // SSE2 +#ifdef __aarch64__ +#undef 
scalar_@op@_@c_sfx@ +NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) { @type@ result = 0; __asm( - "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]" + "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]" : [result] "=w" (result) : [a] "w" (a), [b] "w" (b) ); return result; } +#endif // __aarch64__ /**end repeat1**/ /**end repeat**/ -#endif // !defined(__aarch64__) +#endif // NPY_DISABLE_OPTIMIZATION +// mapping to double if it's possible +#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE +/**begin repeat + * #op = max, min, maxp, minp# + */ + #undef scalar_@op@_l + #define scalar_@op@_l scalar_@op@_d +/**end repeat**/ +#endif /******************************************************************************* ** extra SIMD intrinsics @@ -92,7 +126,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ */ NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) { - npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@]; npyv_storea_@sfx@(s, v); npyv_lanetype_@sfx@ result = s[0]; for(int i=1; i<npyv_nlanes_@sfx@; ++i){ @@ -127,7 +161,11 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) { npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); + // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + // X86 handles the second operand + #ifndef NPY_HAVE_SSE2 result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + #endif result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); return result; } @@ -138,7 +176,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ */ NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) { - npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@]; npyv_storea_@sfx@(s, v); npyv_lanetype_@sfx@ result = s[0]; for(int i=1; 
i<npyv_nlanes_@sfx@; ++i){ @@ -315,10 +353,10 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety * FLOAT, DOUBLE, LONGDOUBLE# * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, * npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * float, double, long double# + * npy_float, npy_double, npy_longdouble# * * #is_fp = 0*10, 1*3# - * #is_unsigend = 1*5, 0*5, 0*3# + * #is_unsigned = 1*5, 0*5, 0*3# * #scalar_sfx = i*10, f, d, l# */ #undef TO_SIMD_SFX @@ -332,7 +370,7 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 #undef TO_SIMD_SFX #endif - #elif @is_unsigend@ + #elif @is_unsigned@ #define TO_SIMD_SFX(X) X##_u@len@ #else #define TO_SIMD_SFX(X) X##_s@len@ |
