/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Scalar intrinsics
 ******************************************************************************/
// signed/unsigned int
#define scalar_max_i(A, B) ((A > B) ? A : B)
#define scalar_min_i(A, B) ((A < B) ? A : B)
// fp, propagates NaNs
#define scalar_max(A, B) ((A >= B || npy_isnan(A)) ? A : B)
#define scalar_max_f scalar_max
#define scalar_max_d scalar_max
#define scalar_max_l scalar_max
#define scalar_min(A, B) ((A <= B || npy_isnan(A)) ? A : B)
#define scalar_min_f scalar_min
#define scalar_min_d scalar_min
#define scalar_min_l scalar_min
// fp, ignores NaNs
#define scalar_maxp_f fmaxf
#define scalar_maxp_d fmax
#define scalar_maxp_l fmaxl
#define scalar_minp_f fminf
#define scalar_minp_d fmin
#define scalar_minp_l fminl

// special optimization for NaN-propagating fp scalar max/min,
// since there is no C99 equivalent (fmax/fmin ignore NaNs)
#ifndef NPY_DISABLE_OPTIMIZATION
/**begin repeat
 * #type = npy_float, npy_double#
 * #sfx = f32, f64#
 * #c_sfx = f, d#
 * #isa_sfx = s, d#
 * #sse_type = __m128, __m128d#
 */
/**begin repeat1
 * #op = max, min#
 * #neon_instr = fmax, fmin#
 */
#ifdef NPY_HAVE_SSE2
#undef scalar_@op@_@c_sfx@
NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
    @sse_type@ va = _mm_set_s@isa_sfx@(a);
    @sse_type@ vb = _mm_set_s@isa_sfx@(b);
    @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
    // x86 max/min return the second operand if either one is NaN,
    // so select `a` whenever it is NaN to propagate it
    @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
    #ifdef NPY_HAVE_SSE41
        rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
    #else
        rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
    #endif
    return _mm_cvts@isa_sfx@_@sfx@(rv);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_@op@_@c_sfx@
NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
    @type@ result = 0;
    __asm(
        "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
        : [result] "=w" (result)
        : [a] "w" (a), [b] "w" (b)
    );
    return result;
}
#endif // __aarch64__
/**end repeat1**/
/**end repeat**/
#endif // NPY_DISABLE_OPTIMIZATION

// map long double to double when they have the same size
#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
/**begin repeat
 * #op = max, min, maxp, minp#
 */
#undef scalar_@op@_l
#define scalar_@op@_l scalar_@op@_d
/**end repeat**/
#endif

/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/
/**begin repeat
 * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
 * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
 * #is_fp = 0*8, 1, 1#
 * #scalar_sfx = i*8, f, d#
 */
/**begin repeat1
 * # intrin = max, min, maxp, minp#
 * # fp_only = 0, 0, 1, 1#
 */
#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
#if @is_fp@ && !@fp_only@
    #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
#else
    #define V_INTRIN npyv_@intrin@_@sfx@
    #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@
#endif

// contiguous input.
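/*
 * Reduces a contiguous block to a single lane: eight vectors are accumulated
 * into `acc` per iteration, then one vector at a time, `acc` is reduced
 * horizontally with V_REDUCE_INTRIN, and any leftover elements are folded in
 * with the scalar operator. `op1[0]` seeds the accumulator and receives the
 * final result, so NaN behavior follows the V_INTRIN/SCALAR_OP pair selected
 * above.
 */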
static inline void
simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
{
    if (len < 1) {
        return;
    }
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * 8;
    npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
    for (; len >= wstep; len -= wstep, ip += wstep) {
    #ifdef NPY_HAVE_SSE2
        NPY_PREFETCH(ip + wstep, 0, 3);
    #endif
        npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
        npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
        npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
        npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
        npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
        npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
        npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
        npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);

        npyv_@sfx@ r01 = V_INTRIN(v0, v1);
        npyv_@sfx@ r23 = V_INTRIN(v2, v3);
        npyv_@sfx@ r45 = V_INTRIN(v4, v5);
        npyv_@sfx@ r67 = V_INTRIN(v6, v7);
        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
    }
    for (; len >= vstep; len -= vstep, ip += vstep) {
        acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
    }
    npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
    // Scalar - finish up any remaining iterations
    for (; len > 0; --len, ++ip) {
        const npyv_lanetype_@sfx@ in2 = *ip;
        r = SCALAR_OP(r, in2);
    }
    op1[0] = r;
}

// contiguous inputs and output.
static inline void
simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
                               npyv_lanetype_@sfx@ *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int vectorsPerLoop = 6;
#else
    // To avoid memory bandwidth bottleneck
    const int vectorsPerLoop = 2;
#endif
    const int elemPerVector = npyv_nlanes_@sfx@;
    int elemPerLoop = vectorsPerLoop * elemPerVector;

    npy_intp i = 0;

    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
    #if NPY_SIMD_WIDTH == 128
        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
    #endif
        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
    #if NPY_SIMD_WIDTH == 128
        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
    #endif
        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
        npyv_@sfx@ m1 = V_INTRIN(v1, u1);
    #if NPY_SIMD_WIDTH == 128
        npyv_@sfx@ m2 = V_INTRIN(v2, u2);
        npyv_@sfx@ m3 = V_INTRIN(v3, u3);
        npyv_@sfx@ m4 = V_INTRIN(v4, u4);
        npyv_@sfx@ m5 = V_INTRIN(v5, u5);
    #endif
        npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
        npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
    #if NPY_SIMD_WIDTH == 128
        npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
        npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
        npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
        npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
    #endif
    }

    for (; (i+elemPerVector) <= len; i += elemPerVector) {
        npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
        npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
        npyv_store_@sfx@(op1 + i, m0);
    }

    // Scalar - finish up any remaining iterations
    for (; i < len; ++i) {
        const npyv_lanetype_@sfx@ in1 = ip1[i];
        const npyv_lanetype_@sfx@ in2 = ip2[i];
        op1[i] = SCALAR_OP(in1, in2);
    }
}

// non-contiguous for float 32/64-bit memory access
#if @is_fp@
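/*
 * Note: the stride arguments (sip1, sip2, sop1) of this kernel are in element
 * units; the dispatcher below passes the ufunc byte strides divided by the
 * lane size. Unit strides take the plain contiguous load/store path, and any
 * tail that does not fill a whole vector is handled by the scalar operator.
 */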
static inline void
simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
                           const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
                           npyv_lanetype_@sfx@ *op1, npy_intp sop1,
                           npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
                         ip2 += sip2*vstep, op1 += sop1*vstep)
    {
        npyv_@sfx@ a, b;
        if (sip1 == 1) {
            a = npyv_load_@sfx@(ip1);
        } else {
            a = npyv_loadn_@sfx@(ip1, sip1);
        }
        if (sip2 == 1) {
            b = npyv_load_@sfx@(ip2);
        } else {
            b = npyv_loadn_@sfx@(ip2, sip2);
        }
        npyv_@sfx@ r = V_INTRIN(a, b);
        if (sop1 == 1) {
            npyv_store_@sfx@(op1, r);
        } else {
            npyv_storen_@sfx@(op1, sop1, r);
        }
    }
    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
        const npyv_lanetype_@sfx@ a = *ip1;
        const npyv_lanetype_@sfx@ b = *ip2;
        *op1 = SCALAR_OP(a, b);
    }
}
#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN
#endif // simd_chk && (!fp_only || (is_fp && fp_only))
#undef SCALAR_OP
/**end repeat1**/
/**end repeat**/

/*******************************************************************************
 ** Defining ufunc inner functions
 ******************************************************************************/
/**begin repeat
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *         BYTE, SHORT, INT, LONG, LONGLONG,
 *         FLOAT, DOUBLE, LONGDOUBLE#
 *
 * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG,
 *          FLOAT, DOUBLE, LONGDOUBLE#
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_float, npy_double, npy_longdouble#
 *
 * #is_fp = 0*10, 1*3#
 * #is_unsigned = 1*5, 0*5, 0*3#
 * #scalar_sfx = i*10, f, d, l#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
    #if @is_fp@
        #define TO_SIMD_SFX(X) X##_f@len@
        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif @is_unsigned@
        #define TO_SIMD_SFX(X) X##_u@len@
    #else
        #define TO_SIMD_SFX(X) X##_s@len@
    #endif
/**end repeat1**/
#endif

/**begin repeat1
 * # kind = maximum, minimum, fmax, fmin#
 * # intrin = max, min, maxp, minp#
 * # fp_only = 0, 0, 1, 1#
 */
#if !@fp_only@ || (@is_fp@ && @fp_only@)
#define SCALAR_OP scalar_@intrin@_@scalar_sfx@

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
             len = dimensions[0];
    npy_intp i = 0;
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (IS_BINARY_REDUCE) {
        // reduce and contiguous
        if (is2 == sizeof(@type@)) {
            TO_SIMD_SFX(simd_reduce_c_@intrin@)(
                (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
    }
    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
             !is_mem_overlap(ip2, is2, op1, os1, len)
    ) {
        // no overlap and operands are binary contiguous
        if (IS_BINARY_CONT(@type@, @type@)) {
            TO_SIMD_SFX(simd_binary_ccc_@intrin@)(
                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
            );
            goto clear_fp;
        }
        // unrolled scalars are faster than non-contiguous vector load/store on Arm
    #if !defined(NPY_HAVE_NEON) && @is_fp@
        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
        ) {
            TO_SIMD_SFX(simd_binary_@intrin@)(
                (STYPE*)ip1, is1/sizeof(STYPE),
                (STYPE*)ip2, is2/sizeof(STYPE),
                (STYPE*)op1, os1/sizeof(STYPE), len
            );
            goto clear_fp;
        }
    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
    // scalar unrolls
    if (IS_BINARY_REDUCE) {
        // Note, 8x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 8;
        if ((i+elemPerLoop) <= len) {
            @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
            @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
            @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
            @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
            @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
            @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
            @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
            @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));

            i += elemPerLoop;
            for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
                @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
                @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
                @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
                @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
                @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
                @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
                @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
                @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));

                m0 = SCALAR_OP(m0, v0);
                m1 = SCALAR_OP(m1, v1);
                m2 = SCALAR_OP(m2, v2);
                m3 = SCALAR_OP(m3, v3);
                m4 = SCALAR_OP(m4, v4);
                m5 = SCALAR_OP(m5, v5);
                m6 = SCALAR_OP(m6, v6);
                m7 = SCALAR_OP(m7, v7);
            }

            m0 = SCALAR_OP(m0, m1);
            m2 = SCALAR_OP(m2, m3);
            m4 = SCALAR_OP(m4, m5);
            m6 = SCALAR_OP(m6, m7);

            m0 = SCALAR_OP(m0, m2);
            m4 = SCALAR_OP(m4, m6);

            m0 = SCALAR_OP(m0, m4);

            *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0);
        }
    } else {
        // Note, 4x unroll was chosen for best results on Apple M1
        npy_intp elemPerLoop = 4;
        for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
            /* Note, we can't just load all, do all ops, then store all here.
             * Sometimes ufuncs are called with `accumulate`, which relies on
             * previous iterations having finished before the next iteration
             * starts. For example, the output of iteration 2 depends on the
             * result of iteration 1.
             */
            /**begin repeat2
             * #unroll = 0, 1, 2, 3#
             */
            @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
            @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
            *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
            /**end repeat2**/
        }
    }
#endif // NPY_DISABLE_OPTIMIZATION

    ip1 += is1 * i;
    ip2 += is2 * i;
    op1 += os1 * i;
    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
clear_fp:
    npyv_cleanup();
#endif
#if @is_fp@
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}

NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions,
 npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *ip1 = args[0];
    char *indx = args[1];
    char *value = args[2];
    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
    npy_intp n = dimensions[0];
    npy_intp i;
    @type@ *indexed;
    for (i = 0; i < n; i++, indx += isindex, value += isb) {
        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
        *indexed = SCALAR_OP(*indexed, *(@type@ *)value);
    }
    return 0;
}

#undef SCALAR_OP

#endif // !fp_only || (is_fp && fp_only)
/**end repeat1**/
/**end repeat**/