/*@targets
** $maxopt baseline
** (avx2 fma3) avx512f
** vsx2 vsx3 vsx4
** neon_vfpv4
** vxe vxe2
**/
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "fast_loop_macros.h"
/*
* TODO:
 * - use a vectorized Payne-Hanek style reduction for large elements, or when
 *   there's no native FUSED support, instead of falling back to libc
*/
#if NPY_SIMD_FMA3 // native support
/**begin repeat
* #check = F64, F32#
* #sfx = f64, f32#
*/
#if NPY_SIMD_@check@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
* 1) x* = x - y*c1
* 2) x* = x - y*c2
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
NPY_FINLINE npyv_@sfx@
simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
{
npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
return reduced_x;
}
#endif
/**end repeat**/
#if NPY_SIMD_F64
/**begin repeat
* #op = cos, sin#
*/
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
NPY_NOINLINE npyv_f64
#endif
simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
{
// MSVC doesn't compile with direct vector access, so we copy it here
// as we have no npyv_get_lane/npyv_set_lane intrinsics
npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
npyv_storea_f64(out_copy, out);
for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
if (cmp_bits & (1 << i)) {
out_copy[i] = npy_@op@(out_copy[i]);
}
}
return npyv_loada_f64(out_copy);
}
/**end repeat**/
/*
* Approximate sine algorithm for x \in [-pi/2, pi/2]
* worst-case error is 3.5 ulp.
* abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
*/
NPY_FINLINE npyv_f64
simd_approx_sine_poly_f64(npyv_f64 r)
{
const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);
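    /*
     * Evaluate P(r2) with Horner's scheme; the final FMA below folds it back
     * as sin(r) ~= r + r*r2*P(r2).
     */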
npyv_f64 r2 = npyv_mul_f64(r, r);
npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
y = npyv_muladd_f64(y, r2, poly3);
y = npyv_muladd_f64(y, r2, poly4);
y = npyv_muladd_f64(y, r2, poly5);
y = npyv_muladd_f64(y, r2, poly6);
y = npyv_muladd_f64(y, r2, poly7);
y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);
return y;
}
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
NPY_FINLINE npyv_f64
simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
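    /*
     * -pi split into three parts: pi1 is -pi rounded to double precision,
     * pi2 and pi3 are successive correction terms, so pi1 + pi2 + pi3
     * approximates -pi to well beyond double precision.
     */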
const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);
return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
}
NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */
const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */
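    /*
     * A single unsigned compare flags both tiny inputs (|x| < 0x1p-509, where
     * the subtraction wraps around) and large inputs (|x| >= 0x1p23), as well
     * as NaN/Inf; these lanes are recomputed by the scalar fallback.
     */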
return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
}
NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
const npyv_f64 range_val = npyv_setall_f64(0x1p23);
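    /*
     * Comparing the raw bit patterns works because the IEEE encoding of
     * non-negative doubles is monotonic; NaN/Inf encodings are larger than
     * that of 0x1p23, so they are flagged as well.
     */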
return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
}
NPY_FINLINE npyv_f64
simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
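    /*
     * Adding 0x1.8p52 rounds n to the nearest integer and leaves that integer
     * in the low mantissa bits; its lowest bit (the quadrant parity) is shifted
     * into the sign position below to negate sin(r) for odd half-periods.
     */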
/* n = rint((|x|+pi/2)/pi) - 0.5. */
npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
n = npyv_sub_f64(n, shift);
n = npyv_sub_f64(n, npyv_setall_f64(0.5));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = simd_range_reduction_pi2(r, n);
/* sin(r) poly approx. */
npyv_f64 y = simd_approx_sine_poly_f64(r);
/* sign. */
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
}
NPY_FINLINE npyv_f64
simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
/* n = rint(|x|/pi). */
npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
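    /* Same magic-number trick as the cosine kernel: bit 0 of the rounded n
       (the quadrant parity) is moved into the sign position. */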
npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
n = npyv_sub_f64(n, shift);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = simd_range_reduction_pi2(r, n);
/* sin(r) poly approx. */
npyv_f64 y = simd_approx_sine_poly_f64(r);
/* sign. */
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
}
/**begin repeat
* #op = cos, sin#
*/
NPY_FINLINE void
simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
const int vstep = npyv_nlanes_f64;
npyv_f64 out = npyv_zero_f64();
npyv_f64 x_in;
for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
if (ssrc == 1) {
x_in = npyv_load_tillz_f64(src, len);
} else {
x_in = npyv_loadn_tillz_f64(src, ssrc, len);
}
npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
npyv_f64 r = npyv_reinterpret_f64_u64(ir);
npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
npyv_b64 cmp = simd_@op@_range_check_f64(ir);
        /*
         * If fenv exceptions are to be triggered correctly, set any special lanes
         * to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
         * scalar loop later.
         */
r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
        // At least one lane is in range, so the vector computation is useful
if (!npyv_all_b64(cmp)) {
out = simd_@op@_poly_f64(r, ir, sign);
}
if (npyv_any_b64(cmp)) {
out = npyv_select_f64(cmp, x_in, out);
out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
}
if (sdst == 1) {
npyv_store_till_f64(dst, len, out);
} else {
npyv_storen_till_f64(dst, sdst, len, out);
}
}
npyv_cleanup();
}
/**end repeat**/
#endif // NPY_SIMD_F64
#if NPY_SIMD_F32
/*
* Approximate cosine algorithm for x \in [-PI/4, PI/4]
* Maximum ULP across all 32-bit floats = 0.875
*/
NPY_FINLINE npyv_f32
simd_cosine_poly_f32(npyv_f32 x2)
{
const npyv_f32 invf8 = npyv_setall_f32(0x1.98e616p-16f);
const npyv_f32 invf6 = npyv_setall_f32(-0x1.6c06dcp-10f);
const npyv_f32 invf4 = npyv_setall_f32(0x1.55553cp-05f);
const npyv_f32 invf2 = npyv_setall_f32(-0x1.000000p-01f);
const npyv_f32 invf0 = npyv_setall_f32(0x1.000000p+00f);
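    /*
     * Coefficients are (presumably minimax-adjusted) reciprocal factorials of
     * the cosine Taylor series: ~1/8!, -1/6!, 1/4!, -1/2!, 1; evaluated with
     * Horner's scheme in x2.
     */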
npyv_f32 r = npyv_muladd_f32(invf8, x2, invf6);
r = npyv_muladd_f32(r, x2, invf4);
r = npyv_muladd_f32(r, x2, invf2);
r = npyv_muladd_f32(r, x2, invf0);
return r;
}
/*
* Approximate sine algorithm for x \in [-PI/4, PI/4]
* Maximum ULP across all 32-bit floats = 0.647
* Polynomial approximation based on unpublished work by T. Myklebust
*/
NPY_FINLINE npyv_f32
simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2)
{
const npyv_f32 invf9 = npyv_setall_f32(0x1.7d3bbcp-19f);
const npyv_f32 invf7 = npyv_setall_f32(-0x1.a06bbap-13f);
const npyv_f32 invf5 = npyv_setall_f32(0x1.11119ap-07f);
const npyv_f32 invf3 = npyv_setall_f32(-0x1.555556p-03f);
npyv_f32 r = npyv_muladd_f32(invf9, x2, invf7);
r = npyv_muladd_f32(r, x2, invf5);
r = npyv_muladd_f32(r, x2, invf3);
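    /* Multiply by one more factor of x2 (adding zero keeps the FMA form) so the
       final step computes x + x*x2*P(x2), i.e. the odd sine series. */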
r = npyv_muladd_f32(r, x2, npyv_zero_f32());
r = npyv_muladd_f32(r, x, x);
return r;
}
/*
* Vectorized approximate sine/cosine algorithms: The following code is a
* vectorized version of the algorithm presented here:
* https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
* (1) Load data in registers and generate mask for elements that are
* within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
* 117435.992f] for sine.
* (2) For elements within range, perform range reduction using Cody-Waite's
* method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
* (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
* int(y).
 * (4) For elements outside that range, Cody-Waite reduction performs poorly,
 * leading to catastrophic cancellation. For those elements we fall back to
 * scalar libc calls (npy_cosf/npy_sinf).
 * (5) The vectorized implementation has a max ULP of 1.49 and is at least
 * 5-7x (x86), 2.5-3x (Power), and 1-2x (Arm) faster than the scalar
 * implementation when the magnitude of every element in the array is below
 * 71476.0625f (117435.992f for sine). The worst case is when all elements are
 * large, leading to about a 1-2% reduction in performance.
*/
typedef enum
{
SIMD_COMPUTE_SIN,
SIMD_COMPUTE_COS
} SIMD_TRIG_OP;
static void SIMD_MSVC_NOINLINE
simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
npy_intp len, SIMD_TRIG_OP trig_op)
{
// Load up frequently used constants
const npyv_f32 zerosf = npyv_zero_f32();
const npyv_s32 ones = npyv_setall_s32(1);
const npyv_s32 twos = npyv_setall_s32(2);
const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f);
const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f);
const npyv_f32 codyw_pio2_medf = npyv_setall_f32(-0x1.5110b4p-22f);
const npyv_f32 codyw_pio2_lowf = npyv_setall_f32(-0x1.846988p-48f);
const npyv_f32 rint_cvt_magic = npyv_setall_f32(0x1.800000p+23f);
// Cody-Waite's range
float max_codi = 117435.992f;
if (trig_op == SIMD_COMPUTE_COS) {
max_codi = 71476.0625f;
}
const npyv_f32 max_cody = npyv_setall_f32(max_codi);
const int vstep = npyv_nlanes_f32;
for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
npyv_f32 x_in;
if (ssrc == 1) {
x_in = npyv_load_tillz_f32(src, len);
} else {
x_in = npyv_loadn_tillz_f32(src, ssrc, len);
}
npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
#if NPY_SIMD_CMPSIGNAL
// Eliminate NaN to avoid FP invalid exception
x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
#endif
npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
/*
* For elements outside of this range, Cody-Waite's range reduction
* becomes inaccurate and we will call libc to compute cosine for
* these numbers
*/
if (simd_maski != 0) {
npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
// round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22
quadrant = npyv_add_f32(quadrant, rint_cvt_magic);
quadrant = npyv_sub_f32(quadrant, rint_cvt_magic);
// Cody-Waite's range reduction algorithm
npyv_f32 reduced_x = simd_range_reduction_f32(
x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf
);
npyv_f32 reduced_x2 = npyv_square_f32(reduced_x);
// compute cosine and sine
npyv_f32 cos = simd_cosine_poly_f32(reduced_x2);
npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2);
npyv_s32 iquadrant = npyv_round_s32_f32(quadrant);
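            // cos(x) = sin(x + pi/2): bumping the quadrant index by one lets
            // the same parity tests below select and negate the right polynomial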
if (trig_op == SIMD_COMPUTE_COS) {
iquadrant = npyv_add_s32(iquadrant, ones);
}
// blend sin and cos based on the quadrant
npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32());
cos = npyv_select_f32(sine_mask, sin, cos);
// multiply by -1 for appropriate elements
npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos);
cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos);
cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF));
if (sdst == 1) {
npyv_store_till_f32(dst, len, cos);
} else {
npyv_storen_till_f32(dst, sdst, len, cos);
}
}
if (simd_maski != (npy_uint64)((1 << vstep) - 1)) {
float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
npyv_storea_f32(ip_fback, x_in);
// process elements using libc for large elements
if (trig_op == SIMD_COMPUTE_COS) {
for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
if ((simd_maski >> i) & 1) {
continue;
}
dst[sdst*i] = npy_cosf(ip_fback[i]);
}
}
else {
for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
if ((simd_maski >> i) & 1) {
continue;
}
dst[sdst*i] = npy_sinf(ip_fback[i]);
}
}
}
}
npyv_cleanup();
}
#endif // NPY_SIMD_F32
#endif // NPY_SIMD_FMA3
/**begin repeat
* #func = cos, sin#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD_F64 && NPY_SIMD_FMA3
const double *src = (double*)args[0];
double *dst = (double*)args[1];
const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
) {
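        // Overlapping arrays or SIMD-unsupported strides: process one element
        // at a time through the same kernel with unit strides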
for (; len > 0; --len, src += ssrc, dst += sdst) {
simd_@func@_f64(src, 1, dst, 1, 1);
}
} else {
simd_@func@_f64(src, ssrc, dst, sdst, len);
}
#else
UNARY_LOOP {
const npy_double in1 = *(npy_double *)ip1;
*(npy_double *)op1 = npy_@func@(in1);
}
#endif
}
/**end repeat**/
/**begin repeat
* #func = sin, cos#
* #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD_F32 && NPY_SIMD_FMA3
const npy_float *src = (npy_float*)args[0];
npy_float *dst = (npy_float*)args[1];
const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
) {
for (; len > 0; --len, src += ssrc, dst += sdst) {
simd_sincos_f32(src, 1, dst, 1, 1, @enum@);
}
} else {
simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
}
#else
UNARY_LOOP {
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = npy_@func@f(in1);
}
#endif
}
/**end repeat**/