#define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include #include "npy_config.h" #include "numpy/arrayobject.h" #define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b)) #define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) /* * Functions used to try to avoid/elide temporaries in python expressions * of type a + b + b by translating some operations into in-place operations. * This example translates to this bytecode: * * 0 LOAD_FAST 0 (a) * 3 LOAD_FAST 1 (b) * 6 BINARY_ADD * 7 LOAD_FAST 1 (b) * 10 BINARY_ADD * * The two named variables get their reference count increased by the load * instructions so they always have a reference count larger than 1. * The temporary of the first BINARY_ADD on the other hand only has a count of * 1. Only temporaries can have a count of 1 in python so we can use this to * transform the second operation into an in-place operation and not affect the * output of the program. * CPython does the same thing to resize memory instead of copying when doing * string concatenation. * The gain can be very significant (4x-6x) when avoiding the temporary allows * the operation to remain in the cpu caches and can still be 50% faster for * array larger than cpu cache size. * * A complication is that a DSO (dynamic shared object) module (e.g. cython) * could call the PyNumber functions directly on arrays with reference count of * 1. * This is handled by checking the call stack to verify that we have been * called directly from the cpython interpreter. * To achieve this we check that all functions in the callstack until the * cpython frame evaluation function are located in cpython or numpy. * This is an expensive operation so temporaries are only avoided for rather * large arrays. * * A possible future improvement would be to change cpython to give us access * to the top of the stack. Then we could just check that the objects involved * are on the cpython stack instead of checking the function callstack. * * Elision can be applied to all operations that do have in-place variants and * do not change types (addition, subtraction, multiplication, float division, * logical and bitwise operations ...) * For commutative operations (addition, multiplication, ...) if eliding into * the lefthand side fails it can succeed on the righthand side by swapping the * arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b. * * TODO only supports systems with backtrace(), Windows can probably be * supported too by using the appropriate Windows APIs. */ #if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION /* 1 prints elided operations, 2 prints stacktraces */ #define NPY_ELIDE_DEBUG 0 #define NPY_MAX_STACKSIZE 10 /* TODO can pep523 be used to somehow? */ #define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault" /* * Heuristic size of the array in bytes at which backtrace overhead generation * becomes less than speed gained by in-place operations. Depends on stack depth * being checked. Measurements with 10 stacks show it getting worthwhile * around 100KiB but to be conservative put it higher around where the L2 cache * spills. */ #ifndef Py_DEBUG #define NPY_MIN_ELIDE_BYTES (256 * 1024) #else /* * in debug mode always elide but skip scalars as these can convert to 0d array * during in-place operations */ #define NPY_MIN_ELIDE_BYTES (32) #endif #include #if defined HAVE_EXECINFO_H #include #elif defined HAVE_LIBUNWIND_H #include #endif /* * linear search pointer in table * number of pointers is usually quite small but if a performance impact can be * measured this could be converted to a binary search */ static int find_addr(void * addresses[], npy_intp naddr, void * addr) { npy_intp j; for (j = 0; j < naddr; j++) { if (addr == addresses[j]) { return 1; } } return 0; } static int check_callers(int * cannot) { /* * get base addresses of multiarray and python, check if * backtrace is in these libraries only calling dladdr if a new max address * is found. * When after the initial multiarray stack everything is inside python we * can elide as no C-API user could have messed up the reference counts. * Only check until the python frame evaluation function is found * approx 10us overhead for stack size of 10 * * TODO some calls go over scalarmath in umath but we cannot get the base * address of it from multiarraymodule as it is not linked against it */ static int init = 0; /* * measured DSO object memory start and end, if an address is located * inside these bounds it is part of that library so we don't need to call * dladdr on it (assuming linear memory) */ static void * pos_python_start; static void * pos_python_end; static void * pos_ma_start; static void * pos_ma_end; /* known address storage to save dladdr calls */ static void * py_addr[64]; static void * pyeval_addr[64]; static npy_intp n_py_addr = 0; static npy_intp n_pyeval = 0; void *buffer[NPY_MAX_STACKSIZE]; int i, nptrs; int ok = 0; /* cannot determine callers */ if (init == -1) { *cannot = 1; return 0; } nptrs = backtrace(buffer, NPY_MAX_STACKSIZE); if (nptrs == 0) { /* complete failure, disable elision */ init = -1; *cannot = 1; return 0; } /* setup DSO base addresses, ends updated later */ if (NPY_UNLIKELY(init == 0)) { Dl_info info; /* get python base address */ if (dladdr(&PyNumber_Or, &info)) { pos_python_start = info.dli_fbase; pos_python_end = info.dli_fbase; } else { init = -1; return 0; } /* get multiarray base address */ if (dladdr(&PyArray_INCREF, &info)) { pos_ma_start = info.dli_fbase; pos_ma_end = info.dli_fbase; } else { init = -1; return 0; } init = 1; } /* loop over callstack addresses to check if they leave numpy or cpython */ for (i = 0; i < nptrs; i++) { Dl_info info; int in_python = 0; int in_multiarray = 0; #if NPY_ELIDE_DEBUG >= 2 dladdr(buffer[i], &info); printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase, info.dli_sname, info.dli_saddr); #endif /* check stored DSO boundaries first */ if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) { in_python = 1; } else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) { in_multiarray = 1; } /* update DSO boundaries via dladdr if necessary */ if (!in_python && !in_multiarray) { if (dladdr(buffer[i], &info) == 0) { init = -1; ok = 0; break; } /* update DSO end */ if (info.dli_fbase == pos_python_start) { pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end); in_python = 1; } else if (info.dli_fbase == pos_ma_start) { pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end); in_multiarray = 1; } } /* no longer in ok libraries and not reached PyEval -> no elide */ if (!in_python && !in_multiarray) { ok = 0; break; } /* in python check if the frame eval function was reached */ if (in_python) { /* if reached eval we are done */ if (find_addr(pyeval_addr, n_pyeval, buffer[i])) { ok = 1; break; } /* * check if its some other function, use pointer lookup table to * save expensive dladdr calls */ if (find_addr(py_addr, n_py_addr, buffer[i])) { continue; } /* new python address, check for PyEvalFrame */ if (dladdr(buffer[i], &info) == 0) { init = -1; ok = 0; break; } if (info.dli_sname && strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) { if (n_pyeval < (npy_intp)ARRAY_SIZE(pyeval_addr)) { /* store address to not have to dladdr it again */ pyeval_addr[n_pyeval++] = buffer[i]; } ok = 1; break; } else if (n_py_addr < (npy_intp)ARRAY_SIZE(py_addr)) { /* store other py function to not have to dladdr it again */ py_addr[n_py_addr++] = buffer[i]; } } } /* all stacks after numpy are from python, we can elide */ if (ok) { *cannot = 0; return 1; } else { #if NPY_ELIDE_DEBUG != 0 puts("cannot elide due to c-api usage"); #endif *cannot = 1; return 0; } } /* * check if in "alhs @op@ orhs" that alhs is a temporary (refcnt == 1) so we * can do in-place operations instead of creating a new temporary * "cannot" is set to true if it cannot be done even with swapped arguments */ static int can_elide_temp(PyObject *olhs, PyObject *orhs, int *cannot) { /* * to be a candidate the array needs to have reference count 1, be an exact * array of a basic type, own its data and size larger than threshold */ PyArrayObject *alhs = (PyArrayObject *)olhs; if (Py_REFCNT(olhs) != 1 || !PyArray_CheckExact(olhs) || !PyArray_ISNUMBER(alhs) || !PyArray_CHKFLAGS(alhs, NPY_ARRAY_OWNDATA) || !PyArray_ISWRITEABLE(alhs) || PyArray_CHKFLAGS(alhs, NPY_ARRAY_WRITEBACKIFCOPY) || PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) { return 0; } if (PyArray_CheckExact(orhs) || PyArray_CheckAnyScalar(orhs)) { PyArrayObject * arhs; /* create array from right hand side */ Py_INCREF(orhs); arhs = (PyArrayObject *)PyArray_EnsureArray(orhs); if (arhs == NULL) { return 0; } /* * if rhs is not a scalar dimensions must match * TODO: one could allow broadcasting on equal types */ if (!(PyArray_NDIM(arhs) == 0 || (PyArray_NDIM(arhs) == PyArray_NDIM(alhs) && PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs), PyArray_NDIM(arhs))))) { Py_DECREF(arhs); return 0; } /* must be safe to cast (checks values for scalar in rhs) */ if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs), NPY_SAFE_CASTING)) { Py_DECREF(arhs); return check_callers(cannot); } Py_DECREF(arhs); } return 0; } /* * try eliding a binary op, if commutative is true also try swapped arguments */ NPY_NO_EXPORT int try_binary_elide(PyObject * m1, PyObject * m2, PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2), PyObject ** res, int commutative) { /* set when no elision can be done independent of argument order */ int cannot = 0; if (can_elide_temp(m1, m2, &cannot)) { *res = inplace_op((PyArrayObject *)m1, m2); #if NPY_ELIDE_DEBUG != 0 puts("elided temporary in binary op"); #endif return 1; } else if (commutative && !cannot) { if (can_elide_temp(m2, m1, &cannot)) { *res = inplace_op((PyArrayObject *)m2, m1); #if NPY_ELIDE_DEBUG != 0 puts("elided temporary in commutative binary op"); #endif return 1; } } *res = NULL; return 0; } /* try elide unary temporary */ NPY_NO_EXPORT int can_elide_temp_unary(PyArrayObject * m1) { int cannot; if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) || !PyArray_ISNUMBER(m1) || !PyArray_CHKFLAGS(m1, NPY_ARRAY_OWNDATA) || !PyArray_ISWRITEABLE(m1) || PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) { return 0; } if (check_callers(&cannot)) { #if NPY_ELIDE_DEBUG != 0 puts("elided temporary in unary op"); #endif return 1; } else { return 0; } } #else /* unsupported interpreter or missing backtrace */ NPY_NO_EXPORT int can_elide_temp_unary(PyArrayObject * m1) { return 0; } NPY_NO_EXPORT int try_binary_elide(PyArrayObject * m1, PyObject * m2, PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2), PyObject ** res, int commutative) { *res = NULL; return 0; } #endif