/*@targets
** $maxopt baseline
** neon asimd
** sse2 avx2 avx512_skx
** vsx2
** vx vxe
**/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
/*******************************************************************************
** Scalar intrinsics
******************************************************************************/
// signed/unsigned int
#define scalar_max_i(A, B) ((A > B) ? A : B)
#define scalar_min_i(A, B) ((A < B) ? A : B)
// fp, propagates NaNs
#define scalar_max(A, B) ((A >= B || npy_isnan(A)) ? A : B)
#define scalar_max_f scalar_max
#define scalar_max_d scalar_max
#define scalar_max_l scalar_max
#define scalar_min(A, B) ((A <= B || npy_isnan(A)) ? A : B)
#define scalar_min_f scalar_min
#define scalar_min_d scalar_min
#define scalar_min_l scalar_min
// fp, ignores NaNs
#define scalar_maxp_f fmaxf
#define scalar_maxp_d fmax
#define scalar_maxp_l fmaxl
#define scalar_minp_f fminf
#define scalar_minp_d fmin
#define scalar_minp_l fminl
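// Illustrative semantics of the two families (comment only, not compiled):
//   scalar_max_f(NaN, 1.0f)  -> NaN    (NaN propagates: np.maximum/np.minimum)
//   scalar_maxp_f(NaN, 1.0f) -> 1.0f   (NaN ignored:    np.fmax/np.fmin)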
// Optimized NaN-propagating scalar min/max for fp types,
// since C99 offers no standard function with these semantics.
#ifndef NPY_DISABLE_OPTIMIZATION
/**begin repeat
* #type = npy_float, npy_double#
* #sfx = f32, f64#
* #c_sfx = f, d#
* #isa_sfx = s, d#
* #sse_type = __m128, __m128d#
*/
/**begin repeat1
* #op = max, min#
* #neon_instr = fmax, fmin#
*/
#ifdef NPY_HAVE_SSE2
#undef scalar_@op@_@c_sfx@
NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
@sse_type@ va = _mm_set_s@isa_sfx@(a);
@sse_type@ vb = _mm_set_s@isa_sfx@(b);
@sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
// x86 min/max return the second operand when the first is NaN,
// so detect a NaN in `va` and select it back to keep NaN propagation.
@sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
#ifdef NPY_HAVE_SSE41
rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
#else
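// No blendv before SSE4.1: emulate it bitwise. When `nn` is all ones
// (va is not NaN) this yields rv; when `nn` is zero (va is NaN) it yields va.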
rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
#endif
return _mm_cvts@isa_sfx@_@sfx@(rv);
}
#endif // SSE2
#ifdef __aarch64__
#undef scalar_@op@_@c_sfx@
NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
@type@ result = 0;
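// AArch64 fmax/fmin already propagate NaNs, so a single instruction
// implements the NaN-propagating scalar op without branching.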
__asm(
"@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
: [result] "=w" (result)
: [a] "w" (a), [b] "w" (b)
);
return result;
}
#endif // __aarch64__
/**end repeat1**/
/**end repeat**/
#endif // NPY_DISABLE_OPTIMIZATION
// map the long double variants to double when both types have the same size
#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
/**begin repeat
* #op = max, min, maxp, minp#
*/
#undef scalar_@op@_l
#define scalar_@op@_l scalar_@op@_d
/**end repeat**/
#endif
/*******************************************************************************
** Defining the SIMD kernels
******************************************************************************/
/**begin repeat
* #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
* #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
* #is_fp = 0*8, 1, 1#
* #scalar_sfx = i*8, f, d#
*/
/**begin repeat1
* # intrin = max, min, maxp, minp#
* # fp_only = 0, 0, 1, 1#
*/
#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
#if @is_fp@ && !@fp_only@
#define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
#define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
#else
#define V_INTRIN npyv_@intrin@_@sfx@
#define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@
#endif
// contiguous input.
static inline void
simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
{
if (len < 1) {
return;
}
const int vstep = npyv_nlanes_@sfx@;
const int wstep = vstep*8;
npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
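// The accumulator is seeded with the current reduction value in op1[0];
// the main loop then consumes 8 vectors (wstep lanes) per iteration.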
for (; len >= wstep; len -= wstep, ip += wstep) {
#ifdef NPY_HAVE_SSE2
NPY_PREFETCH(ip + wstep, 0, 3);
#endif
npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
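// Combine the 8 vectors pairwise to keep the dependency chain short.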
npyv_@sfx@ r01 = V_INTRIN(v0, v1);
npyv_@sfx@ r23 = V_INTRIN(v2, v3);
npyv_@sfx@ r45 = V_INTRIN(v4, v5);
npyv_@sfx@ r67 = V_INTRIN(v6, v7);
acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
}
for (; len >= vstep; len -= vstep, ip += vstep) {
acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
}
npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
// Scalar - finish up any remaining iterations
for (; len > 0; --len, ++ip) {
const npyv_lanetype_@sfx@ in2 = *ip;
r = SCALAR_OP(r, in2);
}
op1[0] = r;
}
// contiguous inputs and output.
static inline void
simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
npyv_lanetype_@sfx@ *op1, npy_intp len)
{
#if NPY_SIMD_WIDTH == 128
// Note, 6x unroll was chosen for best results on Apple M1
const int vectorsPerLoop = 6;
#else
// To avoid memory bandwidth bottleneck
const int vectorsPerLoop = 2;
#endif
const int elemPerVector = npyv_nlanes_@sfx@;
int elemPerLoop = vectorsPerLoop * elemPerVector;
npy_intp i = 0;
for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
#if NPY_SIMD_WIDTH == 128
npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
#endif
npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
#if NPY_SIMD_WIDTH == 128
npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
#endif
npyv_@sfx@ m0 = V_INTRIN(v0, u0);
npyv_@sfx@ m1 = V_INTRIN(v1, u1);
#if NPY_SIMD_WIDTH == 128
npyv_@sfx@ m2 = V_INTRIN(v2, u2);
npyv_@sfx@ m3 = V_INTRIN(v3, u3);
npyv_@sfx@ m4 = V_INTRIN(v4, u4);
npyv_@sfx@ m5 = V_INTRIN(v5, u5);
#endif
npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
#if NPY_SIMD_WIDTH == 128
npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
#endif
}
for (; (i+elemPerVector) <= len; i += elemPerVector) {
npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
npyv_@sfx@ m0 = V_INTRIN(v0, u0);
npyv_store_@sfx@(op1 + i, m0);
}
// Scalar - finish up any remaining iterations
for (; i < len; ++i) {
const npyv_lanetype_@sfx@ in1 = ip1[i];
const npyv_lanetype_@sfx@ in2 = ip2[i];
op1[i] = SCALAR_OP(in1, in2);
}
}
// non-contiguous inputs/output, 32/64-bit floating point only
#if @is_fp@
static inline void
simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
npyv_lanetype_@sfx@ *op1, npy_intp sop1,
npy_intp len)
{
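// Strides here are in elements, not bytes; the dispatcher divides the
// byte strides by sizeof(lane type) before calling this kernel.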
const int vstep = npyv_nlanes_@sfx@;
for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
ip2 += sip2*vstep, op1 += sop1*vstep
) {
npyv_@sfx@ a, b;
if (sip1 == 1) {
a = npyv_load_@sfx@(ip1);
} else {
a = npyv_loadn_@sfx@(ip1, sip1);
}
if (sip2 == 1) {
b = npyv_load_@sfx@(ip2);
} else {
b = npyv_loadn_@sfx@(ip2, sip2);
}
npyv_@sfx@ r = V_INTRIN(a, b);
if (sop1 == 1) {
npyv_store_@sfx@(op1, r);
} else {
npyv_storen_@sfx@(op1, sop1, r);
}
}
for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
const npyv_lanetype_@sfx@ a = *ip1;
const npyv_lanetype_@sfx@ b = *ip2;
*op1 = SCALAR_OP(a, b);
}
}
#endif
#undef V_INTRIN
#undef V_REDUCE_INTRIN
#endif // simd_chk && (!fp_only || (is_fp && fp_only))
#undef SCALAR_OP
/**end repeat1**/
/**end repeat**/
/*******************************************************************************
** Defining ufunc inner functions
******************************************************************************/
/**begin repeat
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
* BYTE, SHORT, INT, LONG, LONGLONG,
* FLOAT, DOUBLE, LONGDOUBLE#
*
* #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
* BYTE, SHORT, INT, LONG, LONGLONG,
* FLOAT, DOUBLE, LONGDOUBLE#
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_byte, npy_short, npy_int, npy_long, npy_longlong,
* npy_float, npy_double, npy_longdouble#
*
* #is_fp = 0*10, 1*3#
* #is_unsigned = 1*5, 0*5, 0*3#
* #scalar_sfx = i*10, f, d, l#
*/
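// TO_SIMD_SFX(FN) expands FN to the npyv suffix matching this C type,
// e.g. FN##_f32 for FLOAT or FN##_u8 for UBYTE. It is left undefined
// when no suitable SIMD support is available, which disables the SIMD
// paths below.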
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
* #len = 8, 16, 32, 64#
*/
#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
#if @is_fp@
#define TO_SIMD_SFX(X) X##_f@len@
#if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
#undef TO_SIMD_SFX
#endif
#if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
#undef TO_SIMD_SFX
#endif
#elif @is_unsigned@
#define TO_SIMD_SFX(X) X##_u@len@
#else
#define TO_SIMD_SFX(X) X##_s@len@
#endif
/**end repeat1**/
#endif
/**begin repeat1
* # kind = maximum, minimum, fmax, fmin#
* # intrin = max, min, maxp, minp#
* # fp_only = 0, 0, 1, 1#
*/
#if !@fp_only@ || (@is_fp@ && @fp_only@)
#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
len = dimensions[0];
npy_intp i = 0;
#ifdef TO_SIMD_SFX
#undef STYPE
#define STYPE TO_SIMD_SFX(npyv_lanetype)
if (IS_BINARY_REDUCE) {
// reduce and contiguous
if (is2 == sizeof(@type@)) {
TO_SIMD_SFX(simd_reduce_c_@intrin@)(
(STYPE*)ip2, (STYPE*)op1, len
);
goto clear_fp;
}
}
else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
!is_mem_overlap(ip2, is2, op1, os1, len)
) {
// no overlap and operands are binary contiguous
if (IS_BINARY_CONT(@type@, @type@)) {
TO_SIMD_SFX(simd_binary_ccc_@intrin@)(
(STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
);
goto clear_fp;
}
// on Arm, unrolled scalar code beats non-contiguous vector load/store,
// so this strided SIMD path is skipped when NEON is available
#if !defined(NPY_HAVE_NEON) && @is_fp@
if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
) {
TO_SIMD_SFX(simd_binary_@intrin@)(
(STYPE*)ip1, is1/sizeof(STYPE),
(STYPE*)ip2, is2/sizeof(STYPE),
(STYPE*)op1, os1/sizeof(STYPE), len
);
goto clear_fp;
}
#endif
}
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
// scalar unrolls
if (IS_BINARY_REDUCE) {
// Note, 8x unroll was chosen for best results on Apple M1
npy_intp elemPerLoop = 8;
if((i+elemPerLoop) <= len){
@type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
@type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
@type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
@type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
@type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
@type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
@type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
@type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));
i += elemPerLoop;
for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
@type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
@type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
@type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
@type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
@type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
@type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
@type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
@type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
m0 = SCALAR_OP(m0, v0);
m1 = SCALAR_OP(m1, v1);
m2 = SCALAR_OP(m2, v2);
m3 = SCALAR_OP(m3, v3);
m4 = SCALAR_OP(m4, v4);
m5 = SCALAR_OP(m5, v5);
m6 = SCALAR_OP(m6, v6);
m7 = SCALAR_OP(m7, v7);
}
m0 = SCALAR_OP(m0, m1);
m2 = SCALAR_OP(m2, m3);
m4 = SCALAR_OP(m4, m5);
m6 = SCALAR_OP(m6, m7);
m0 = SCALAR_OP(m0, m2);
m4 = SCALAR_OP(m4, m6);
m0 = SCALAR_OP(m0, m4);
*((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0);
}
} else {
// Note, 4x unroll was chosen for best results on Apple M1
npy_intp elemPerLoop = 4;
for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
/* Note, we can't just load all, do all ops, then store all here.
 * Sometimes ufuncs are called with `accumulate`, which relies on each
 * iteration being finished before the next one starts: for example,
 * the output of iteration 2 depends on the result of iteration 1.
 */
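// e.g. with np.maximum.accumulate the output buffer aliases one of the
// inputs shifted by a single element, so each store must land before
// the next iteration's load.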
/**begin repeat2
* #unroll = 0, 1, 2, 3#
*/
@type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
@type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
*((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
/**end repeat2**/
}
}
#endif // NPY_DISABLE_OPTIMIZATION
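// Scalar tail: advance the pointers past whatever the paths above handled
// and finish the remaining elements one at a time.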
ip1 += is1 * i;
ip2 += is2 * i;
op1 += os1 * i;
for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
*((@type@ *)op1) = SCALAR_OP(in1, in2);
}
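// The SIMD paths jump here: npyv_cleanup() releases any vector state
// (e.g. AVX upper halves) and, for fp types, any float status the SIMD
// kernels may have raised on NaNs is cleared below.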
#ifdef TO_SIMD_SFX
clear_fp:
npyv_cleanup();
#endif
#if @is_fp@
npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
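// Indexed variant: applies the operation in place at ip1[index] for each
// (index, value) pair, e.g. for ufunc.at-style updates.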
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
char *ip1 = args[0];
char *indx = args[1];
char *value = args[2];
npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
npy_intp n = dimensions[0];
npy_intp i;
@type@ *indexed;
for(i = 0; i < n; i++, indx += isindex, value += isb) {
indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
*indexed = SCALAR_OP(*indexed, *(@type@ *)value);
}
return 0;
}
#undef SCALAR_OP
#endif // !fp_only || (is_fp && fp_only)
/**end repeat1**/
/**end repeat**/