author     Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>   2021-12-13 12:55:07 -0800
committer  Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>   2021-12-13 12:55:07 -0800
commit     0d171288ecd9ccfee739b15faa58de8243ae4a53 (patch)
tree       72facb85b26d83a7c89172e0f43cdb74218ab362 /numpy
parent     9fe353e048bc317099bb13cae4cb5a4de8c0c656 (diff)
download   numpy-0d171288ecd9ccfee739b15faa58de8243ae4a53.tar.gz
Integrate requested changes, improve scalar operations, address Linux aarch64
We've incorporated the changes you've requested for scalar operations.
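For reference, the core of the scalar change is that the NaN-propagating min/max helpers now use a short SSE sequence instead of the plain C ternary when `NPY_HAVE_SSE2` is available (see the diff below). A minimal, float-only sketch of the same idea, written outside NumPy's template machinery (the `scalar_max_f_ref`/`scalar_max_f_sse` names are illustrative only, not the patch's generated names):
```c
#include <math.h>       /* isnan */
#include <smmintrin.h>  /* SSE4.1: _mm_blendv_ps (compile with -msse4.1) */

/* Plain C reference: NaN-propagating max, as in the scalar_max_f macro. */
static inline float scalar_max_f_ref(float a, float b) {
    return (a >= b || isnan(a)) ? a : b;
}

/* SSE sketch: _mm_max_ss() returns b when a is NaN, so blend a back in
 * wherever a compares unordered with itself (i.e. a is NaN). */
static inline float scalar_max_f_sse(float a, float b) {
    __m128 va = _mm_set_ss(a);
    __m128 vb = _mm_set_ss(b);
    __m128 rv = _mm_max_ss(va, vb);     /* picks b if a is NaN          */
    __m128 nn = _mm_cmpord_ss(va, va);  /* all-ones lane iff a is not NaN */
    rv = _mm_blendv_ps(va, rv, nn);     /* a is NaN -> keep a (the NaN) */
    return _mm_cvtss_f32(rv);
}
```
`_mm_max_ss` alone returns the second operand whenever the first is NaN, so the blend on the `cmpord` mask is what restores propagation of a NaN in `a`.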
**Testing**
- Apple silicon M1 native (arm64 / aarch64) -- No test failures
- Apple silicon M1 Rosetta (x86_64) -- No new test failures
- iMacPro1,1 (AVX512F) -- No test failures
- Ubuntu VM (aarch64) -- No test failures
**Benchmarks**
As before, Apple silicon M1 native (arm64 / aarch64) performance looks similar to the original patch (comparison below).
x86_64 results (both Apple silicon M1 Rosetta and the iMacPro1,1 with AVX512F) vary: some benchmarks are better, some are worse. Compared to the previous re-org, though, we see overall improvements.
Apple silicon M1 native (arm64 / aarch64) comparison to previous commit:
```
before after ratio
[8b01e839] [18565b27]
<gh-issue-17989/feedback/round-1> <gh-issue-17989/feedback/round-2>
+ 176±0.2μs 196±1μs 1.11 bench_function_base.Sort.time_sort('heap', 'int16', ('ordered',))
+ 234±0.2μs 261±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp'>, 2, 4, 'f')
+ 43.4±0.4μs 48.3±0.4μs 1.11 bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',))
+ 22.5±0.1μs 25.1±0.3μs 1.11 bench_shape_base.Block2D.time_block2d((512, 512), 'uint8', (2, 2))
+ 4.75±0.05μs 5.28±0.07μs 1.11 bench_ma.UFunc.time_scalar(True, True, 1000)
+ 224±0.2μs 248±0.9μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp2'>, 1, 1, 'f')
+ 233±0.5μs 258±1μs 1.11 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp'>, 4, 2, 'f')
+ 8.81±0.02μs 9.72±0.1μs 1.10 bench_shape_base.Block2D.time_block2d((32, 32), 'uint16', (2, 2))
+ 8.71±0.1μs 9.58±0.3μs 1.10 bench_indexing.ScalarIndexing.time_assign_cast(2)
+ 96.2±0.03μs 105±3μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'fabs'>, 1, 1, 'd')
+ 20.2±0.1μs 22.0±0.5μs 1.09 bench_shape_base.Block.time_block_simple_row_wise(100)
+ 469±4μs 510±7μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'cos'>, 2, 1, 'd')
+ 43.9±0.02μs 46.4±2μs 1.06 bench_function_base.Median.time_odd_inplace
+ 4.75±0μs 5.02±0.2μs 1.06 bench_itemselection.Take.time_contiguous((2, 1000, 1), 'raise', 'int64')
- 16.4±0.07μs 15.6±0.4μs 0.95 bench_ufunc.UFunc.time_ufunc_types('left_shift')
- 127±6μs 120±0.1μs 0.94 bench_ufunc.UFunc.time_ufunc_types('deg2rad')
- 10.9±0.5μs 10.3±0.01μs 0.94 bench_function_base.Sort.time_sort('merge', 'int64', ('reversed',))
- 115±5μs 108±0.2μs 0.94 bench_function_base.Bincount.time_bincount
- 17.0±0.4μs 15.9±0.03μs 0.94 bench_ufunc.UFunc.time_ufunc_types('right_shift')
- 797±30ns 743±0.5ns 0.93 bench_ufunc.ArgParsingReduce.time_add_reduce_arg_parsing((array([0., 1.]), axis=0))
- 18.4±1μs 17.2±0.04μs 0.93 bench_core.CountNonzero.time_count_nonzero_multi_axis(3, 10000, <class 'bool'>)
- 241±7μs 224±0.3μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'exp2'>, 2, 1, 'f')
- 105±1μs 96.7±0.02μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'deg2rad'>, 2, 4, 'f')
- 23.3±0.2μs 21.4±0.02μs 0.92 bench_lib.Pad.time_pad((1, 1, 1, 1, 1), 1, 'edge')
- 833±20μs 766±2μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'arctanh'>, 1, 1, 'd')
- 86.8±4μs 79.5±0.4μs 0.92 bench_ufunc.UFunc.time_ufunc_types('conjugate')
- 2.58±0.1μs 2.36±0μs 0.91 bench_ufunc.CustomScalar.time_divide_scalar2(<class 'numpy.float32'>)
- 102±4μs 92.8±0.7μs 0.91 bench_ufunc.UFunc.time_ufunc_types('logical_not')
- 46.6±0.4μs 42.1±0.07μs 0.90 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'absolute'>, 4, 1, 'd')
- 158±0.7μs 142±0.07μs 0.90 bench_lib.Pad.time_pad((4, 4, 4, 4), 1, 'linear_ramp')
- 729±6μs 657±1μs 0.90 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'arccos'>, 4, 4, 'f')
- 63.6±0.9μs 56.2±1μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd')
- 730±40μs 605±3μs 0.83 bench_lib.Pad.time_pad((1024, 1024), 1, 'reflect')
SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.
PERFORMANCE DECREASED.
```
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/umath/loops_minmax.dispatch.c.src   68
1 file changed, 53 insertions(+), 15 deletions(-)
```
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index c2eddcee6..891be445e 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
  ** $maxopt baseline
  ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -20,7 +22,6 @@
 #define scalar_max_i(A, B) ((A > B) ? A : B)
 #define scalar_min_i(A, B) ((A < B) ? A : B)
 // fp, propagates NaNs
-#if !defined(__aarch64__)
 #define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B)
 #define scalar_max_d scalar_max_f
 #define scalar_max_l scalar_max_f
@@ -34,28 +35,61 @@
 #define scalar_minp_f fminf
 #define scalar_minp_d fmin
 #define scalar_minp_l fminl
-#elif defined(__aarch64__)
+
+// special optimization for fp scalars propagates NaNs
+// since there're no C99 support for it
+#ifndef NPY_DISABLE_OPTIMIZATION
 /**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #scalar_sfx = f, d, l#
- * #asm_sfx = s, d, d#
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #c_sfx = f, d#
+ * #isa_sfx = s, d#
+ * #sse_type = __m128, __m128d#
  */
 /**begin repeat1
- * #op = max, min, maxp, minp#
- * #asm_instr = fmax, fmin, fmaxnm, fminnm#
+ * #op = max, min#
+ * #neon_instr = fmax, fmin#
  */
-static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
+#ifdef NPY_HAVE_SSE2
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+    @sse_type@ va = _mm_set_s@isa_sfx@(a);
+    @sse_type@ vb = _mm_set_s@isa_sfx@(b);
+    @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
+    // X86 handel second operand
+    @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
+    #ifdef NPY_HAVE_SSE41
+        rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
+    #else
+        rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
+    #endif
+    return _mm_cvts@isa_sfx@_@sfx@(rv);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
     @type@ result = 0;
     __asm(
-        "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]"
+        "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
         : [result] "=w" (result)
         : [a] "w" (a), [b] "w" (b)
     );
     return result;
 }
+#endif // __aarch64__
 /**end repeat1**/
 /**end repeat**/
-#endif // !defined(__aarch64__)
+#endif // NPY_DISABLE_OPTIMIZATION
+// mapping to double if its possible
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
+/**begin repeat
+ * #op = max, min, maxp, minp#
+ */
+    #undef scalar_@op@_l
+    #define scalar_@op@_l scalar_@op@_d
+/**end repeat**/
+#endif

 /*******************************************************************************
  ** extra SIMD intrinsics
@@ -92,7 +126,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
  */
 NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
 {
-    npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+    npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
     npyv_storea_@sfx@(s, v);
     npyv_lanetype_@sfx@ result = s[0];
     for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -127,7 +161,11 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
 NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b)
 {
     npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b);
+    // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+    // X86 handle second operand
+    #ifndef NPY_HAVE_SSE2
     result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+    #endif
     result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a);
     return result;
 }
@@ -138,7 +176,7 @@ static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){
  */
 NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
 {
-    npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@];
+    npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
     npyv_storea_@sfx@(s, v);
     npyv_lanetype_@sfx@ result = s[0];
     for(int i=1; i<npyv_nlanes_@sfx@; ++i){
@@ -315,10 +353,10 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
  *          FLOAT, DOUBLE, LONGDOUBLE#
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *         float, double, long double#
+ *         npy_float, npy_double, npy_longdouble#
  *
  * #is_fp = 0*10, 1*3#
- * #is_unsigend = 1*5, 0*5, 0*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
  * #scalar_sfx = i*10, f, d, l#
  */
 #undef TO_SIMD_SFX
@@ -332,7 +370,7 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
     #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
         #undef TO_SIMD_SFX
     #endif
-    #elif @is_unsigend@
+    #elif @is_unsigned@
         #define TO_SIMD_SFX(X) X##_u@len@
     #else
         #define TO_SIMD_SFX(X) X##_s@len@
```
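One detail in the hunk above worth spelling out: when SSE4.1's `_mm_blendv_p*` is not available, the fallback `_mm_xor_p*(va, _mm_and_p*(_mm_xor_p*(va, rv), nn))` is the classic branchless bitwise-select identity `a ^ ((a ^ r) & mask)`, which yields `r` wherever the mask bits are set and `a` wherever they are clear. A tiny standalone sketch of that identity (the `bit_select` helper name is hypothetical, for illustration only):
```c
#include <stdint.h>
#include <stdio.h>

/* Bitwise select: per bit, pick r where mask is 1, keep a where mask is 0.
 * a ^ ((a ^ r) & mask) == (a & ~mask) | (r & mask), but needs no inversion --
 * the same trick the non-SSE4.1 path applies to whole 128-bit registers
 * via _mm_xor_p*/_mm_and_p*. */
static inline uint32_t bit_select(uint32_t a, uint32_t r, uint32_t mask) {
    return a ^ ((a ^ r) & mask);
}

int main(void) {
    uint32_t a = 0xDEADBEEF, r = 0x12345678;
    printf("%08X\n", bit_select(a, r, 0x00000000)); /* DEADBEEF: keep a      */
    printf("%08X\n", bit_select(a, r, 0xFFFFFFFF)); /* 12345678: take r      */
    printf("%08X\n", bit_select(a, r, 0xFFFF0000)); /* 1234BEEF: mix per bit */
    return 0;
}
```
In the patch, the mask is the `cmpord` result, so the select keeps the first operand (the NaN) in its lane whenever that operand is NaN, lane-wise across the register.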