author     Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>   2021-12-13 12:55:07 -0800
committer  Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>   2021-12-13 12:55:07 -0800
commit     0d171288ecd9ccfee739b15faa58de8243ae4a53 (patch)
tree       72facb85b26d83a7c89172e0f43cdb74218ab362 /numpy
parent     9fe353e048bc317099bb13cae4cb5a4de8c0c656 (diff)
download   numpy-0d171288ecd9ccfee739b15faa58de8243ae4a53.tar.gz
Integrate requested changes, improve scalar operations, address Linux aarch64
We've incorporated the changes you've requested for scalar operations.
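For reference, the core of the scalar change is that the NaN-propagating min/max helpers now use a short SSE sequence instead of the plain C ternary when `NPY_HAVE_SSE2` is available (see the diff below). A minimal, float-only sketch of the same idea, written outside NumPy's template machinery (the `scalar_max_f_ref`/`scalar_max_f_sse` names are illustrative only, not the patch's generated names):
```c
#include <math.h>       /* isnan */
#include <smmintrin.h>  /* SSE4.1: _mm_blendv_ps (compile with -msse4.1) */

/* Plain C reference: NaN-propagating max, as in the scalar_max_f macro. */
static inline float scalar_max_f_ref(float a, float b) {
    return (a >= b || isnan(a)) ? a : b;
}

/* SSE sketch: _mm_max_ss() returns b when a is NaN, so blend a back in
 * wherever a compares unordered with itself (i.e. a is NaN). */
static inline float scalar_max_f_sse(float a, float b) {
    __m128 va = _mm_set_ss(a);
    __m128 vb = _mm_set_ss(b);
    __m128 rv = _mm_max_ss(va, vb);     /* picks b if a is NaN          */
    __m128 nn = _mm_cmpord_ss(va, va);  /* all-ones lane iff a is not NaN */
    rv = _mm_blendv_ps(va, rv, nn);     /* a is NaN -> keep a (the NaN) */
    return _mm_cvtss_f32(rv);
}
```
`_mm_max_ss` alone returns the second operand whenever the first is NaN, so the blend on the `cmpord` mask is what restores propagation of a NaN in `a`.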
**Testing**
- Apple silicon M1 native (arm64 / aarch64) -- No test failures
- Apple silicon M1 Rosetta (x86_64) -- No new test failures
- iMacPro1,1 (AVX512F) -- No test failures
- Ubuntu VM (aarch64) -- No test failures
**Benchmarks**
As before, Apple silicon M1 native (arm64 / aarch64) performance looks similar to the original patch (comparison below).
x86_64 results (both Apple silicon M1 Rosetta and the iMacPro1,1 with AVX512F) vary: some benchmarks are better, some are worse. Compared to the previous re-org, though, we see overall improvements.
Apple silicon M1 native (arm64 / aarch64) comparison to previous commit:
```
before after ratio
[8b01e839] [18565b27]
<gh-issue-17989/feedback/round-1> <gh-issue-17989/feedback/round-2>
+ 176±0.2μs 196±1μs 1.11 bench_function_base.Sort.time_sort('heap', 'int16', ('ordered',))
+ 234±0.2μs 261±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp'>, 2, 4, 'f')
+ 43.4±0.4μs 48.3±0.4μs 1.11 bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',))
+ 22.5±0.1μs 25.1±0.3μs 1.11 bench_shape_base.Block2D.time_block2d((512, 512), 'uint8', (2, 2))
+ 4.75±0.05μs 5.28±0.07μs 1.11 bench_ma.UFunc.time_scalar(True, True, 1000)
+ 224±0.2μs 248±0.9μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp2'>, 1, 1, 'f')
+ 233±0.5μs 258±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp'>, 4, 2, 'f')
+ 8.81±0.02μs 9.72±0.1μs 1.10 bench_shape_base.Block2D.time_block2d((32, 32), 'uint16', (2, 2))
+ 8.71±0.1μs 9.58±0.3μs 1.10 bench_indexing.ScalarIndexing.time_assign_cast(2)
+ 96.2±0.03μs 105±3μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'fabs'>, 1, 1, 'd')
+ 20.2±0.1μs 22.0±0.5μs 1.09 bench_shape_base.Block.time_block_simple_row_wise(100)
+ 469±4μs 510±7μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'cos'>, 2, 1, 'd')
+ 43.9±0.02μs 46.4±2μs 1.06 bench_function_base.Median.time_odd_inplace
+ 4.75±0μs 5.02±0.2μs 1.06 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'int64')
- 16.4±0.07μs 15.6±0.4μs 0.95 bench_ufunc.UFunc.time_ufunc_types('left_shift')
- 127±6μs 120±0.1μs 0.94 bench_ufunc.UFunc.time_ufunc_types('deg2rad')
- 10.9±0.5μs 10.3±0.01μs 0.94 bench_function_base.Sort.time_sort('merge', 'int64', ('reversed',))
- 115±5μs 108±0.2μs 0.94 bench_function_base.Bincount.time_bincount
- 17.0±0.4μs 15.9±0.03μs 0.94 bench_ufunc.UFunc.time_ufunc_types('right_shift')
- 797±30ns 743±0.5ns 0.93 bench_ufunc.ArgParsingReduce.time_add_reduce_arg_parsing((array([0., 1.]), axis=0))
- 18.4±1μs 17.2±0.04μs 0.93 bench_core.CountNonzero.time_count_nonzero_multi_axis(3, 10000, <class 'bool'>)
- 241±7μs 224±0.3μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp2'>, 2, 1, 'f')
- 105±1μs 96.7±0.02μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'deg2rad'>, 2, 4, 'f')
- 23.3±0.2μs 21.4±0.02μs 0.92 bench_lib.Pad.time_pad((1, 1, 1, 1, 1), 1, 'edge')
- 833±20μs 766±2μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'arctanh'>, 1, 1, 'd')
- 86.8±4μs 79.5±0.4μs 0.92 bench_ufunc.UFunc.time_ufunc_types('conjugate')
- 2.58±0.1μs 2.36±0μs 0.91 bench_ufunc.CustomScalar.time_divide_scalar2(<class 'numpy.float32'>)
- 102±4μs 92.8±0.7μs 0.91 bench_ufunc.UFunc.time_ufunc_types('logical_not')
- 46.6±0.4μs 42.1±0.07μs 0.90 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'absolute'>, 4, 1, 'd')
- 158±0.7μs 142±0.07μs 0.90 bench_lib.Pad.time_pad((4, 4, 4, 4), 1, 'linear_ramp')
- 729±6μs 657±1μs 0.90 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'arccos'>, 4, 4, 'f')
- 63.6±0.9μs 56.2±1μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd')
- 730±40μs 605±3μs 0.83 bench_lib.Pad.time_pad((1024, 1024), 1, 'reflect')
SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.
PERFORMANCE DECREASED.
```
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops_minmax.dispatch.c.src   68
1 file changed, 53 insertions(+), 15 deletions(-)
```
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index c2eddcee6..891be445e 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
  ** $maxopt baseline
  ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -20,7 +22,6 @@
 #define scalar_max_i(A, B) ((A > B) ? A : B)
 #define scalar_min_i(A, B) ((A < B) ? A : B)
 // fp, propagates NaNs
-#if !defined(__aarch64__)
 #define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B)
 #define scalar_max_d scalar_max_f
 #define scalar_max_l scalar_max_f
@@ -34,28 +35,61 @@
 #define scalar_minp_f fminf
 #define scalar_minp_d fmin
 #define scalar_minp_l fminl
-#elif defined(__aarch64__)
+
+// special optimization for fp scalars propagates NaNs
+// since there're no C99 support for it
+#ifndef NPY_DISABLE_OPTIMIZATION
 /**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #scalar_sfx = f, d, l#
- * #asm_sfx = s, d, d#
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #c_sfx = f, d#
+ * #isa_sfx = s, d#
+ * #sse_type = __m128, __m128d#
  */
 /**begin repeat1
- * #op = max, min, maxp, minp#
- * #asm_instr = fmax, fmin, fmaxnm, fminnm#
+ * #op = max, min#
+ * #neon_instr = fmax, fmin#
  */
-static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
+#ifdef NPY_HAVE_SSE2
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+    @sse_type@ va = _mm_set_s@isa_sfx@(a);
+    @sse_type@ vb = _mm_set_s@isa_sfx@(b);
+    @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
+    // X86 handel second operand
+    @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
+    #ifdef NPY_HAVE_SSE41
+        rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
+    #else
+        rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
+    #endif
+    return _mm_cvts@isa_sfx@_@sfx@(rv);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
     @type@ result = 0;
     __asm(
-        "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]"
+        "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
         : [result] "=w" (result)
         : [a] "w" (a), [b] "w" (b)
     );
     return result;
 }
+#endif // __aarch64__
 /**end repeat1**/
 /**end repeat**/
-#endif // !defined(__aarch64__)
+#endif // NPY_DISABLE_OPTIMIZATION
+// mapping to double if its possible
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
+/**begin repeat
+ * #op = max, min, maxp, minp#
+ */
+    #undef scalar_@op@_l
+    #define scalar_@op@_l scalar_@op@_d
+/**end repeat**/
+#endif

 /*******************************************************************************
  ** extra SIMD intrinsics
@@ -92,7 +126,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
  */
 NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
 {
-    npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+    npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
     npyv_storea_@sfx@(s, v);
     npyv_lanetype_@sfx@ result = s[0];
     for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -127,7 +161,11 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
 NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b)
 {
     npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b);
+    // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+    // X86 handle second operand
+    #ifndef NPY_HAVE_SSE2
     result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+    #endif
     result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a);
     return result;
 }
@@ -138,7 +176,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
  */
 NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
 {
-    npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+    npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
     npyv_storea_@sfx@(s, v);
     npyv_lanetype_@sfx@ result = s[0];
     for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -315,10 +353,10 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
  *          FLOAT, DOUBLE, LONGDOUBLE#
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *         float, double, long double#
+ *         npy_float, npy_double, npy_longdouble#
  *
  * #is_fp = 0*10, 1*3#
- * #is_unsigend = 1*5, 0*5, 0*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
  * #scalar_sfx = i*10, f, d, l#
  */
 #undef TO_SIMD_SFX
@@ -332,7 +370,7 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
     #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
         #undef TO_SIMD_SFX
     #endif
-    #elif @is_unsigend@
+    #elif @is_unsigned@
         #define TO_SIMD_SFX(X) X##_u@len@
     #else
         #define TO_SIMD_SFX(X) X##_s@len@
```
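One detail in the hunk above worth spelling out: when SSE4.1's `_mm_blendv_p*` is not available, the fallback `_mm_xor_p*(va, _mm_and_p*(_mm_xor_p*(va, rv), nn))` is the classic branchless bitwise-select identity `a ^ ((a ^ r) & mask)`, which yields `r` wherever the mask bits are set and `a` wherever they are clear. A tiny standalone sketch of that identity (the `bit_select` helper name is hypothetical, for illustration only):
```c
#include <stdint.h>
#include <stdio.h>

/* Bitwise select: per bit, pick r where mask is 1, keep a where mask is 0.
 * a ^ ((a ^ r) & mask) == (a & ~mask) | (r & mask), but needs no inversion --
 * the same trick the non-SSE4.1 path applies to whole 128-bit registers
 * via _mm_xor_p*/_mm_and_p*. */
static inline uint32_t bit_select(uint32_t a, uint32_t r, uint32_t mask) {
    return a ^ ((a ^ r) & mask);
}

int main(void) {
    uint32_t a = 0xDEADBEEF, r = 0x12345678;
    printf("%08X\n", bit_select(a, r, 0x00000000)); /* DEADBEEF: keep a      */
    printf("%08X\n", bit_select(a, r, 0xFFFFFFFF)); /* 12345678: take r      */
    printf("%08X\n", bit_select(a, r, 0xFFFF0000)); /* 1234BEEF: mix per bit */
    return 0;
}
```
In the patch, the mask is the `cmpord` result, so the select keeps the first operand (the NaN) in its lane whenever that operand is NaN, lane-wise across the register.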