1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
|
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "npy_config.h"
#include "numpy/arrayobject.h"
#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
/*
* Functions used to try to avoid/elide temporaries in python expressions
* of type a + b + b by translating some operations into in-place operations.
* This example translates to this bytecode:
*
* 0 LOAD_FAST 0 (a)
* 3 LOAD_FAST 1 (b)
* 6 BINARY_ADD
* 7 LOAD_FAST 1 (b)
* 10 BINARY_ADD
*
* The two named variables get their reference count increased by the load
* instructions so they always have a reference count larger than 1.
* The temporary of the first BINARY_ADD on the other hand only has a count of
* 1. Only temporaries can have a count of 1 in python so we can use this to
* transform the second operation into an in-place operation and not affect the
* output of the program.
* CPython does the same thing to resize memory instead of copying when doing
* string concatenation.
* The gain can be very significant (4x-6x) when avoiding the temporary allows
* the operation to remain in the cpu caches and can still be 50% faster for
* array larger than cpu cache size.
*
* A complication is that a DSO (dynamic shared object) module (e.g. cython)
* could call the PyNumber functions directly on arrays with reference count of
* 1.
* This is handled by checking the call stack to verify that we have been
* called directly from the cpython interpreter.
* To achieve this we check that all functions in the callstack until the
* cpython frame evaluation function are located in cpython or numpy.
* This is an expensive operation so temporaries are only avoided for rather
* large arrays.
*
* A possible future improvement would be to change cpython to give us access
* to the top of the stack. Then we could just check that the objects involved
* are on the cpython stack instead of checking the function callstack.
*
* Elision can be applied to all operations that do have in-place variants and
* do not change types (addition, subtraction, multiplication, float division,
* logical and bitwise operations ...)
* For commutative operations (addition, multiplication, ...) if eliding into
* the lefthand side fails it can succeed on the righthand side by swapping the
* arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b.
*
* TODO only supports systems with backtrace(), Windows can probably be
* supported too by using the appropriate Windows APIs.
*/
#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
/* 1 prints elided operations, 2 prints stacktraces */
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
/* TODO can pep523 be used to somehow? */
#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
/*
* Heuristic size of the array in bytes at which backtrace overhead generation
* becomes less than speed gained by in-place operations. Depends on stack depth
* being checked. Measurements with 10 stacks show it getting worthwhile
* around 100KiB but to be conservative put it higher around where the L2 cache
* spills.
*/
#ifndef Py_DEBUG
#define NPY_MIN_ELIDE_BYTES (256 * 1024)
#else
/*
* in debug mode always elide but skip scalars as these can convert to 0d array
* during in-place operations
*/
#define NPY_MIN_ELIDE_BYTES (32)
#endif
#include <dlfcn.h>
#if defined HAVE_EXECINFO_H
#include <execinfo.h>
#elif defined HAVE_LIBUNWIND_H
#include <libunwind.h>
#endif
/*
* linear search pointer in table
* number of pointers is usually quite small but if a performance impact can be
* measured this could be converted to a binary search
*/
static int
find_addr(void * addresses[], npy_intp naddr, void * addr)
{
npy_intp j;
for (j = 0; j < naddr; j++) {
if (addr == addresses[j]) {
return 1;
}
}
return 0;
}
static int
check_callers(int * cannot)
{
/*
* get base addresses of multiarray and python, check if
* backtrace is in these libraries only calling dladdr if a new max address
* is found.
* When after the initial multiarray stack everything is inside python we
* can elide as no C-API user could have messed up the reference counts.
* Only check until the python frame evaluation function is found
* approx 10us overhead for stack size of 10
*
* TODO some calls go over scalarmath in umath but we cannot get the base
* address of it from multiarraymodule as it is not linked against it
*/
static int init = 0;
/*
* measured DSO object memory start and end, if an address is located
* inside these bounds it is part of that library so we don't need to call
* dladdr on it (assuming linear memory)
*/
static void * pos_python_start;
static void * pos_python_end;
static void * pos_ma_start;
static void * pos_ma_end;
/* known address storage to save dladdr calls */
static void * py_addr[64];
static void * pyeval_addr[64];
static npy_intp n_py_addr = 0;
static npy_intp n_pyeval = 0;
void *buffer[NPY_MAX_STACKSIZE];
int i, nptrs;
int ok = 0;
/* cannot determine callers */
if (init == -1) {
*cannot = 1;
return 0;
}
nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
if (nptrs == 0) {
/* complete failure, disable elision */
init = -1;
*cannot = 1;
return 0;
}
/* setup DSO base addresses, ends updated later */
if (NPY_UNLIKELY(init == 0)) {
Dl_info info;
/* get python base address */
if (dladdr(&PyNumber_Or, &info)) {
pos_python_start = info.dli_fbase;
pos_python_end = info.dli_fbase;
}
else {
init = -1;
return 0;
}
/* get multiarray base address */
if (dladdr(&PyArray_INCREF, &info)) {
pos_ma_start = info.dli_fbase;
pos_ma_end = info.dli_fbase;
}
else {
init = -1;
return 0;
}
init = 1;
}
/* loop over callstack addresses to check if they leave numpy or cpython */
for (i = 0; i < nptrs; i++) {
Dl_info info;
int in_python = 0;
int in_multiarray = 0;
#if NPY_ELIDE_DEBUG >= 2
dladdr(buffer[i], &info);
printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
info.dli_sname, info.dli_saddr);
#endif
/* check stored DSO boundaries first */
if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
in_python = 1;
}
else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
in_multiarray = 1;
}
/* update DSO boundaries via dladdr if necessary */
if (!in_python && !in_multiarray) {
if (dladdr(buffer[i], &info) == 0) {
init = -1;
ok = 0;
break;
}
/* update DSO end */
if (info.dli_fbase == pos_python_start) {
pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
in_python = 1;
}
else if (info.dli_fbase == pos_ma_start) {
pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
in_multiarray = 1;
}
}
/* no longer in ok libraries and not reached PyEval -> no elide */
if (!in_python && !in_multiarray) {
ok = 0;
break;
}
/* in python check if the frame eval function was reached */
if (in_python) {
/* if reached eval we are done */
if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
ok = 1;
break;
}
/*
* check if its some other function, use pointer lookup table to
* save expensive dladdr calls
*/
if (find_addr(py_addr, n_py_addr, buffer[i])) {
continue;
}
/* new python address, check for PyEvalFrame */
if (dladdr(buffer[i], &info) == 0) {
init = -1;
ok = 0;
break;
}
if (info.dli_sname &&
strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
if (n_pyeval < (npy_intp)ARRAY_SIZE(pyeval_addr)) {
/* store address to not have to dladdr it again */
pyeval_addr[n_pyeval++] = buffer[i];
}
ok = 1;
break;
}
else if (n_py_addr < (npy_intp)ARRAY_SIZE(py_addr)) {
/* store other py function to not have to dladdr it again */
py_addr[n_py_addr++] = buffer[i];
}
}
}
/* all stacks after numpy are from python, we can elide */
if (ok) {
*cannot = 0;
return 1;
}
else {
#if NPY_ELIDE_DEBUG != 0
puts("cannot elide due to c-api usage");
#endif
*cannot = 1;
return 0;
}
}
/*
* check if in "alhs @op@ orhs" that alhs is a temporary (refcnt == 1) so we
* can do in-place operations instead of creating a new temporary
* "cannot" is set to true if it cannot be done even with swapped arguments
*/
static int
can_elide_temp(PyObject *olhs, PyObject *orhs, int *cannot)
{
/*
* to be a candidate the array needs to have reference count 1, be an exact
* array of a basic type, own its data and size larger than threshold
*/
PyArrayObject *alhs = (PyArrayObject *)olhs;
if (Py_REFCNT(olhs) != 1 || !PyArray_CheckExact(olhs) ||
!PyArray_ISNUMBER(alhs) ||
!PyArray_CHKFLAGS(alhs, NPY_ARRAY_OWNDATA) ||
!PyArray_ISWRITEABLE(alhs) ||
PyArray_CHKFLAGS(alhs, NPY_ARRAY_WRITEBACKIFCOPY) ||
PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
return 0;
}
if (PyArray_CheckExact(orhs) ||
PyArray_CheckAnyScalar(orhs)) {
PyArrayObject * arhs;
/* create array from right hand side */
Py_INCREF(orhs);
arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
if (arhs == NULL) {
return 0;
}
/*
* if rhs is not a scalar dimensions must match
* TODO: one could allow broadcasting on equal types
*/
if (!(PyArray_NDIM(arhs) == 0 ||
(PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
PyArray_NDIM(arhs))))) {
Py_DECREF(arhs);
return 0;
}
/* must be safe to cast (checks values for scalar in rhs) */
if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
NPY_SAFE_CASTING)) {
Py_DECREF(arhs);
return check_callers(cannot);
}
Py_DECREF(arhs);
}
return 0;
}
/*
* try eliding a binary op, if commutative is true also try swapped arguments
*/
NPY_NO_EXPORT int
try_binary_elide(PyObject * m1, PyObject * m2,
PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
PyObject ** res, int commutative)
{
/* set when no elision can be done independent of argument order */
int cannot = 0;
if (can_elide_temp(m1, m2, &cannot)) {
*res = inplace_op((PyArrayObject *)m1, m2);
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in binary op");
#endif
return 1;
}
else if (commutative && !cannot) {
if (can_elide_temp(m2, m1, &cannot)) {
*res = inplace_op((PyArrayObject *)m2, m1);
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in commutative binary op");
#endif
return 1;
}
}
*res = NULL;
return 0;
}
/* try elide unary temporary */
NPY_NO_EXPORT int
can_elide_temp_unary(PyArrayObject * m1)
{
int cannot;
if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
!PyArray_ISNUMBER(m1) ||
!PyArray_CHKFLAGS(m1, NPY_ARRAY_OWNDATA) ||
!PyArray_ISWRITEABLE(m1) ||
PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
return 0;
}
if (check_callers(&cannot)) {
#if NPY_ELIDE_DEBUG != 0
puts("elided temporary in unary op");
#endif
return 1;
}
else {
return 0;
}
}
#else /* unsupported interpreter or missing backtrace */
NPY_NO_EXPORT int
can_elide_temp_unary(PyArrayObject * m1)
{
return 0;
}
NPY_NO_EXPORT int
try_binary_elide(PyArrayObject * m1, PyObject * m2,
PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
PyObject ** res, int commutative)
{
*res = NULL;
return 0;
}
#endif
|