summary | refs | log | tree | commit | diff
path: root/numpy/array_api/_array_object.py
diff options
context:
space:
mode:
author: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> 2022-08-08 10:40:28 -0700
committer: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> 2022-08-08 10:40:28 -0700
commit: eeba168c30dcb0a79906f293e85305150d965810 (patch)
tree: 3f70092eec2ebb5568295c41ac2518618f807898 /numpy/array_api/_array_object.py
parent: c652fcbd9c7d651780ea56f078c8609932822cf7 (diff)
download: numpy-eeba168c30dcb0a79906f293e85305150d965810.tar.gz
PERF: Improve intrinsics for tobits and pack on Apple silicon
Improvements: - up to 1.25x faster on pack_bits() - up to 1.63x faster for comparison loops Apple M1 native (arm64): ``` before after ratio [da6297b9] [1d330fc2] <main> <tobits-pack-intrinsics/upstream-pr> + 67.2±0.6μs 74.9±6μs 1.11 bench_indexing.Indexing.time_op('indexes_', 'I', '') + 38.6±0.06μs 42.8±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'invert'>, 4, 4, 'i') + 7.68±0.02μs 8.47±0.01μs 1.10 bench_function_base.Sort.time_argsort('merge', 'uint32', ('uniform',)) - 42.5±0.3μs 38.6±0.03μs 0.91 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'invert'>, 2, 2, 'l') - 147±1μs 133±1μs 0.91 bench_function_base.Histogram1D.time_small_coverage - 4.62±0.04μs 4.17±0.04μs 0.90 bench_ufunc_strides.AVX_cmplx_funcs.time_ufunc('conjugate', 1, 'D') - 1.11±0μs 992±2ns 0.90 bench_core.PackBits.time_packbits_little(<class 'bool'>) - 2.60±0.01μs 2.33±0μs 0.90 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'float16') - 2.89±0μs 2.59±0μs 0.90 bench_itemselection.Take.time_contiguous((1000, 2), 'raise', 'int32') - 2.90±0μs 2.59±0μs 0.89 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'complex64') - 2.90±0μs 2.59±0μs 0.89 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'float64') - 2.90±0μs 2.58±0μs 0.89 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'int64') - 2.59±0μs 2.31±0.02μs 0.89 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'int16') - 2.90±0μs 2.58±0μs 0.89 bench_itemselection.Take.time_contiguous((1000, 2), 'raise', 'float32') - 2.90±0μs 2.58±0μs 0.89 bench_itemselection.Take.time_contiguous((1000, 1), 'raise', 'longfloat') - 4.78±0μs 4.25±0μs 0.89 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'int16') - 4.78±0μs 4.25±0μs 0.89 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'float16') - 5.41±0μs 4.79±0μs 0.89 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'float64') - 5.41±0μs 4.78±0μs 0.88 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 
'complex64') - 5.41±0μs 4.78±0μs 0.88 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'longfloat') - 5.41±0μs 4.78±0μs 0.88 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'int64') - 10.8±0.02μs 9.42±0.03μs 0.87 bench_function_base.Sort.time_argsort('merge', 'uint32', ('reversed',)) - 90.0±0.08μs 78.4±0.4μs 0.87 bench_function_base.Sort.time_argsort('quick', 'int32', ('reversed',)) - 2.53±0.02μs 2.20±0.02μs 0.87 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.uint16'>) - 2.53±0.01μs 2.18±0μs 0.86 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.uint16'>) - 2.54±0.02μs 2.18±0.01μs 0.86 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.int16'>) - 2.54±0.02μs 2.16±0.02μs 0.85 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.int16'>) - 10.8±0.1μs 8.69±0.01μs 0.81 bench_core.PackBits.time_packbits_axis1(<class 'bool'>) - 57.4±10μs 44.5±0.02μs 0.77 bench_core.CountNonzero.time_count_nonzero(1, 10000, <class 'str'>) - 60.9±0.01μs 46.7±0.03μs 0.77 bench_function_base.Sort.time_argsort('quick', 'int32', ('ordered',)) - 5.41±1μs 4.15±0μs 0.77 bench_itemselection.Take.time_contiguous((1000, 2), 'wrap', 'complex128') - 4.77±0.01μs 3.51±0.01μs 0.74 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.int32'>) - 4.84±0.03μs 3.51±0.02μs 0.72 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.uint32'>) - 4.87±0.01μs 3.50±0.03μs 0.72 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.float32'>) - 4.86±0μs 3.50±0.01μs 0.72 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.uint32'>) - 4.82±0.06μs 3.46±0.04μs 0.72 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.int32'>) - 4.86±0.01μs 3.48±0.03μs 0.72 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.float32'>) - 9.26±0.07μs 5.73±0.03μs 0.62 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.int64'>) - 9.34±0μs 5.76±0.02μs 0.62 
bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.uint64'>) - 9.34±0.07μs 5.75±0.02μs 0.62 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.float64'>) - 9.31±0.04μs 5.73±0.04μs 0.61 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.float64'>) - 9.28±0.09μs 5.67±0.04μs 0.61 bench_ufunc.CustomComparison.time_less_than_scalar1(<class 'numpy.int64'>) - 9.18±0.02μs 5.60±0.02μs 0.61 bench_ufunc.CustomComparison.time_less_than_scalar2(<class 'numpy.uint64'>) ```
Diffstat (limited to 'numpy/array_api/_array_object.py')
0 files changed, 0 insertions, 0 deletions