summaryrefslogtreecommitdiff
path: root/numpy/array_api/_set_functions.py
diff options
context:
space:
mode:
authorDeveloper-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>2021-10-18 14:04:17 -0700
committerDeveloper-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>2021-10-18 14:04:17 -0700
commit1950fb55ce21f2e125672979872dd4c7781d18ce (patch)
tree75a8dfdd914c6e1ae6a531f8d7a54082fca92f1c /numpy/array_api/_set_functions.py
parentdd2eaaabdb0451631e95376ae2b4d319082b438e (diff)
downloadnumpy-1950fb55ce21f2e125672979872dd4c7781d18ce.tar.gz
BUG: NEON min/max is slow (#17989)
This fixes numpy/numpy#17989 by adding ARM NEON implementations for min/max and fmin/max. Before: Rosetta faster than native arm64 by `1.2x - 8.6x`. After: Native arm64 faster than Rosetta by `1.6x - 6.7x`. (2.8x - 15.5x improvement) **Benchmarks** ``` before after ratio [b0e1a445] [8301ffd7] <main> <gh-issue-17989/improve-neon-min-max> + 32.6±0.04μs 37.5±0.08μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 1, 'd') + 32.6±0.06μs 37.5±0.04μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 1, 'd') + 37.8±0.09μs 43.2±0.09μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'f') + 37.7±0.09μs 42.9±0.1μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 2, 'd') + 37.9±0.2μs 43.0±0.02μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 2, 'd') + 37.7±0.01μs 42.3±1μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 2, 'd') + 34.2±0.07μs 38.1±0.05μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 2, 'f') + 32.6±0.03μs 35.8±0.04μs 1.10 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 1, 'f') + 37.1±0.1μs 40.3±0.1μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 1, 2, 'd') + 37.2±0.1μs 40.3±0.04μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'f') + 37.1±0.09μs 40.3±0.07μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 1, 2, 'd') + 68.6±0.5μs 74.2±0.3μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'd') + 37.1±0.2μs 40.0±0.1μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 1, 2, 'd') + 2.42±0μs 2.61±0.05μs 1.08 bench_core.CountNonzero.time_count_nonzero_axis(3, 100, <class 'numpy.int16'>) + 69.1±0.7μs 73.5±0.7μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 4, 4, 'd') + 54.7±0.3μs 58.0±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd') + 54.5±0.2μs 57.8±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 4, 'd') + 3.78±0.04μs 4.00±0.02μs 1.06 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 100, <class 'str'>) + 54.8±0.2μs 57.9±0.3μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'd') + 3.68±0.01μs 3.87±0.02μs 1.05 bench_core.CountNonzero.time_count_nonzero_multi_axis(1, 100, <class 'object'>) + 69.6±0.2μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'd') + 229±2μs 241±0.2μs 1.05 bench_random.Bounded.time_bounded('PCG64', [<class 'numpy.uint64'>, 1535]) - 73.0±0.8μs 69.5±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 4, 4, 'd') - 37.6±0.1μs 35.7±0.3μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 1, 4, 'f') - 88.7±0.04μs 84.2±0.7μs 0.95 bench_lib.Pad.time_pad((256, 128, 1), 1, 'wrap') - 57.9±0.2μs 54.8±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 2, 4, 'd') - 39.9±0.2μs 37.2±0.04μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 1, 2, 'd') - 2.66±0.01μs 2.47±0.01μs 0.93 bench_lib.Nan.time_nanmin(200, 0) - 2.65±0.02μs 2.46±0.04μs 0.93 bench_lib.Nan.time_nanmin(200, 50.0) - 2.64±0.01μs 2.45±0.01μs 0.93 bench_lib.Nan.time_nanmax(200, 90.0) - 2.64±0μs 2.44±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0) - 2.68±0.02μs 2.48±0μs 0.92 bench_lib.Nan.time_nanmax(200, 2.0) - 40.2±0.01μs 37.1±0.1μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'f') - 2.69±0μs 2.47±0μs 0.92 bench_lib.Nan.time_nanmin(200, 2.0) - 2.70±0.02μs 2.48±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0.1) - 2.70±0μs 2.47±0μs 0.91 bench_lib.Nan.time_nanmin(200, 90.0) - 2.70±0μs 2.46±0μs 0.91 bench_lib.Nan.time_nanmin(200, 0.1) - 2.70±0μs 2.42±0.01μs 0.90 bench_lib.Nan.time_nanmax(200, 50.0) - 11.8±0.6ms 10.6±0.6ms 0.89 bench_core.CountNonzero.time_count_nonzero_axis(2, 1000000, <class 'str'>) - 42.7±0.1μs 37.8±0.02μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 2, 2, 'd') - 42.8±0.03μs 37.8±0.2μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 2, 'd') - 43.1±0.2μs 37.7±0.09μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'f') - 37.5±0.07μs 32.6±0.06μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 1, 'd') - 41.7±0.03μs 36.3±0.07μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 1, 4, 'd') - 166±0.8μs 144±1μs 0.87 bench_ufunc.UFunc.time_ufunc_types('fmin') - 11.6±0.8ms 10.0±0.01ms 0.87 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 1000000, <class 'str'>) - 167±0.9μs 144±2μs 0.86 bench_ufunc.UFunc.time_ufunc_types('minimum') - 168±4μs 143±0.5μs 0.85 bench_ufunc.UFunc.time_ufunc_types('fmax') - 167±1μs 142±0.8μs 0.85 bench_ufunc.UFunc.time_ufunc_types('maximum') - 7.10±0μs 4.97±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 2) - 7.11±0.07μs 4.96±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 2) - 7.05±0.07μs 4.68±0μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 4) - 7.13±0μs 4.68±0.01μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 4) - 461±0.2μs 297±7μs 0.64 bench_app.MaxesOfDots.time_it - 7.04±0.07μs 3.95±0μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 2) - 7.06±0.06μs 3.95±0.01μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 2) - 7.09±0.06μs 3.24±0μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 1) - 7.12±0.07μs 3.25±0.02μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 1) - 14.5±0.02μs 3.98±0μs 0.27 bench_reduce.MinMax.time_max(<class 'numpy.int64'>) - 14.6±0.1μs 4.00±0.01μs 0.27 bench_reduce.MinMax.time_min(<class 'numpy.int64'>) - 6.88±0.06μs 1.34±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 1) - 7.00±0μs 1.33±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 1) - 39.4±0.01μs 3.95±0.01μs 0.10 bench_reduce.MinMax.time_min(<class 'numpy.float64'>) - 39.4±0.01μs 3.95±0.02μs 0.10 bench_reduce.MinMax.time_max(<class 'numpy.float64'>) - 254±0.02μs 22.8±0.2μs 0.09 bench_lib.Nan.time_nanmax(200000, 50.0) - 253±0.1μs 22.7±0.1μs 0.09 bench_lib.Nan.time_nanmin(200000, 0) - 254±0.06μs 22.7±0.09μs 0.09 bench_lib.Nan.time_nanmin(200000, 2.0) - 254±0.01μs 22.7±0.03μs 0.09 bench_lib.Nan.time_nanmin(200000, 0.1) - 254±0.04μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmin(200000, 50.0) - 253±0.1μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 0.1) - 253±0.03μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmin(200000, 90.0) - 253±0.02μs 22.7±0.07μs 0.09 bench_lib.Nan.time_nanmax(200000, 0) - 254±0.03μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmax(200000, 90.0) - 254±0.09μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 2.0) - 39.2±0.01μs 2.51±0.01μs 0.06 bench_reduce.MinMax.time_max(<class 'numpy.float32'>) - 39.2±0.01μs 2.50±0.01μs 0.06 bench_reduce.MinMax.time_min(<class 'numpy.float32'>) ``` Size change of _multiarray_umath.cpython-39-darwin.so: Before: 3,890,723 After: 3,924,035 Change: +33,312 (~ +0.856 %)
Diffstat (limited to 'numpy/array_api/_set_functions.py')
0 files changed, 0 insertions, 0 deletions