diff options
author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
---|---|---|
committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
commit | 1950fb55ce21f2e125672979872dd4c7781d18ce (patch) | |
tree | 75a8dfdd914c6e1ae6a531f8d7a54082fca92f1c /numpy/array_api/_set_functions.py | |
parent | dd2eaaabdb0451631e95376ae2b4d319082b438e (diff) | |
download | numpy-1950fb55ce21f2e125672979872dd4c7781d18ce.tar.gz |
BUG: NEON min/max is slow (#17989)
This fixes numpy/numpy#17989 by adding ARM NEON implementations for min/max and fmin/max.
Before: Rosetta faster than native arm64 by `1.2x - 8.6x`.
After: Native arm64 faster than Rosetta by `1.6x - 6.7x`. (2.8x - 15.5x improvement)
**Benchmarks**
```
before after ratio
[b0e1a445] [8301ffd7]
<main> <gh-issue-17989/improve-neon-min-max>
+ 32.6±0.04μs 37.5±0.08μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 1, 'd')
+ 32.6±0.06μs 37.5±0.04μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 1, 'd')
+ 37.8±0.09μs 43.2±0.09μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'f')
+ 37.7±0.09μs 42.9±0.1μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 2, 'd')
+ 37.9±0.2μs 43.0±0.02μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 2, 'd')
+ 37.7±0.01μs 42.3±1μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 2, 'd')
+ 34.2±0.07μs 38.1±0.05μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 2, 'f')
+ 32.6±0.03μs 35.8±0.04μs 1.10 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 1, 'f')
+ 37.1±0.1μs 40.3±0.1μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 1, 2, 'd')
+ 37.2±0.1μs 40.3±0.04μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'f')
+ 37.1±0.09μs 40.3±0.07μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 1, 2, 'd')
+ 68.6±0.5μs 74.2±0.3μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'd')
+ 37.1±0.2μs 40.0±0.1μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 1, 2, 'd')
+ 2.42±0μs 2.61±0.05μs 1.08 bench_core.CountNonzero.time_count_nonzero_axis(3, 100, <class 'numpy.int16'>)
+ 69.1±0.7μs 73.5±0.7μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 4, 4, 'd')
+ 54.7±0.3μs 58.0±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd')
+ 54.5±0.2μs 57.8±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 4, 'd')
+ 3.78±0.04μs 4.00±0.02μs 1.06 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 100, <class 'str'>)
+ 54.8±0.2μs 57.9±0.3μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'd')
+ 3.68±0.01μs 3.87±0.02μs 1.05 bench_core.CountNonzero.time_count_nonzero_multi_axis(1, 100, <class 'object'>)
+ 69.6±0.2μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'd')
+ 229±2μs 241±0.2μs 1.05 bench_random.Bounded.time_bounded('PCG64', [<class 'numpy.uint64'>, 1535])
- 73.0±0.8μs 69.5±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 4, 4, 'd')
- 37.6±0.1μs 35.7±0.3μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 1, 4, 'f')
- 88.7±0.04μs 84.2±0.7μs 0.95 bench_lib.Pad.time_pad((256, 128, 1), 1, 'wrap')
- 57.9±0.2μs 54.8±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 2, 4, 'd')
- 39.9±0.2μs 37.2±0.04μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 1, 2, 'd')
- 2.66±0.01μs 2.47±0.01μs 0.93 bench_lib.Nan.time_nanmin(200, 0)
- 2.65±0.02μs 2.46±0.04μs 0.93 bench_lib.Nan.time_nanmin(200, 50.0)
- 2.64±0.01μs 2.45±0.01μs 0.93 bench_lib.Nan.time_nanmax(200, 90.0)
- 2.64±0μs 2.44±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0)
- 2.68±0.02μs 2.48±0μs 0.92 bench_lib.Nan.time_nanmax(200, 2.0)
- 40.2±0.01μs 37.1±0.1μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'f')
- 2.69±0μs 2.47±0μs 0.92 bench_lib.Nan.time_nanmin(200, 2.0)
- 2.70±0.02μs 2.48±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0.1)
- 2.70±0μs 2.47±0μs 0.91 bench_lib.Nan.time_nanmin(200, 90.0)
- 2.70±0μs 2.46±0μs 0.91 bench_lib.Nan.time_nanmin(200, 0.1)
- 2.70±0μs 2.42±0.01μs 0.90 bench_lib.Nan.time_nanmax(200, 50.0)
- 11.8±0.6ms 10.6±0.6ms 0.89 bench_core.CountNonzero.time_count_nonzero_axis(2, 1000000, <class 'str'>)
- 42.7±0.1μs 37.8±0.02μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 2, 2, 'd')
- 42.8±0.03μs 37.8±0.2μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 2, 'd')
- 43.1±0.2μs 37.7±0.09μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'f')
- 37.5±0.07μs 32.6±0.06μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 1, 'd')
- 41.7±0.03μs 36.3±0.07μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 1, 4, 'd')
- 166±0.8μs 144±1μs 0.87 bench_ufunc.UFunc.time_ufunc_types('fmin')
- 11.6±0.8ms 10.0±0.01ms 0.87 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 1000000, <class 'str'>)
- 167±0.9μs 144±2μs 0.86 bench_ufunc.UFunc.time_ufunc_types('minimum')
- 168±4μs 143±0.5μs 0.85 bench_ufunc.UFunc.time_ufunc_types('fmax')
- 167±1μs 142±0.8μs 0.85 bench_ufunc.UFunc.time_ufunc_types('maximum')
- 7.10±0μs 4.97±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 2)
- 7.11±0.07μs 4.96±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 2)
- 7.05±0.07μs 4.68±0μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 4)
- 7.13±0μs 4.68±0.01μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 4)
- 461±0.2μs 297±7μs 0.64 bench_app.MaxesOfDots.time_it
- 7.04±0.07μs 3.95±0μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 2)
- 7.06±0.06μs 3.95±0.01μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 2)
- 7.09±0.06μs 3.24±0μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 1)
- 7.12±0.07μs 3.25±0.02μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 1)
- 14.5±0.02μs 3.98±0μs 0.27 bench_reduce.MinMax.time_max(<class 'numpy.int64'>)
- 14.6±0.1μs 4.00±0.01μs 0.27 bench_reduce.MinMax.time_min(<class 'numpy.int64'>)
- 6.88±0.06μs 1.34±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 1)
- 7.00±0μs 1.33±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 1)
- 39.4±0.01μs 3.95±0.01μs 0.10 bench_reduce.MinMax.time_min(<class 'numpy.float64'>)
- 39.4±0.01μs 3.95±0.02μs 0.10 bench_reduce.MinMax.time_max(<class 'numpy.float64'>)
- 254±0.02μs 22.8±0.2μs 0.09 bench_lib.Nan.time_nanmax(200000, 50.0)
- 253±0.1μs 22.7±0.1μs 0.09 bench_lib.Nan.time_nanmin(200000, 0)
- 254±0.06μs 22.7±0.09μs 0.09 bench_lib.Nan.time_nanmin(200000, 2.0)
- 254±0.01μs 22.7±0.03μs 0.09 bench_lib.Nan.time_nanmin(200000, 0.1)
- 254±0.04μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmin(200000, 50.0)
- 253±0.1μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 0.1)
- 253±0.03μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmin(200000, 90.0)
- 253±0.02μs 22.7±0.07μs 0.09 bench_lib.Nan.time_nanmax(200000, 0)
- 254±0.03μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmax(200000, 90.0)
- 254±0.09μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 2.0)
- 39.2±0.01μs 2.51±0.01μs 0.06 bench_reduce.MinMax.time_max(<class 'numpy.float32'>)
- 39.2±0.01μs 2.50±0.01μs 0.06 bench_reduce.MinMax.time_min(<class 'numpy.float32'>)
```
Size change of _multiarray_umath.cpython-39-darwin.so:
Before: 3,890,723
After: 3,924,035
Change: +33,312 (~ +0.856 %)
Diffstat (limited to 'numpy/array_api/_set_functions.py')
0 files changed, 0 insertions, 0 deletions