author    Julian Taylor <jtaylor.debian@googlemail.com>  2016-08-30 20:58:29 +0200
committer Julian Taylor <jtaylor.debian@googlemail.com>  2017-02-24 19:21:05 +0100
commit    e6c397b974a72d6970a9ec3a7858355951d43e0a (patch)
tree      b53c2fbbed17666a3627d11ce35c64a6e3f8287c
parent    5f5ccecbfc116284ed8c8d53cd8b203ceef5f7c7 (diff)
download  numpy-e6c397b974a72d6970a9ec3a7858355951d43e0a.tar.gz
ENH: avoid temporary arrays in expressions
Temporary arrays generated in expressions are expensive, as they imply extra memory bandwidth, which is the bottleneck in most numpy operations. For example:

    r = -a + b

One can avoid the temporary when the reference count of one of the operands is one, as it cannot be referenced by any other python code (an rvalue in C++ terms). Python itself uses this method to optimize string concatenation.

The tricky part is that via the C-API one can call the PyNumber_ functions directly and skip increasing the reference count when not needed. The python stack cannot be used to determine this, as python does not export the stack size, only the current position. To avoid the problem we collect the backtrace of the call up to the python frame evaluation function, and if it consists exclusively of functions inside python or the C library we can assume no other library is using the C-API in between.

One issue is that the reliability of backtraces is unknown on non-GNU platforms. On GNU and amd64 it should be reliable enough, even without frame pointers, as glibc will use GCC's stack unwinder via the mandatory DWARF stack annotations. Other platforms with backtrace need to be tested.

Another problem is that stack unwinding is very expensive. Unwinding a 100 function deep stack (which is not uncommon from python) takes around 35us, so the elision can only be done for relatively large arrays. Heuristically it appears to be beneficial around 256KiB array sizes (about the typical L2 cache size).

The performance gain is quite significant: operations that involve temporaries become around 1.5 to 2 times faster, similar to rewriting them manually as inplace operations.
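The refcount observation can be sketched from Python; a minimal, illustrative sketch (sys.getrefcount itself holds one extra reference while it runs, and the elision is transparent, so this only shows why a count of 1 identifies a temporary):

    # Minimal sketch of the refcount reasoning above; illustrative only.
    import sys
    import numpy as np

    a = np.ones(10)
    b = np.ones(10)

    # A named array is referenced by its variable binding and by the
    # argument passed to getrefcount, so the reported count is at least 2.
    print(sys.getrefcount(a))   # >= 2: a named operand is never elided

    # The temporary produced by -a reaches the following addition with a
    # reference count of 1, so its buffer can be reused, as if one had
    # written:
    r = -a    # the temporary gets a name here only for illustration
    r += b    # inplace add: no second allocation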
-rw-r--r--numpy/core/setup.py1
-rw-r--r--numpy/core/setup_common.py4
-rw-r--r--numpy/core/src/multiarray/number.c83
-rw-r--r--numpy/core/src/multiarray/temp_elide.c391
-rw-r--r--numpy/core/src/multiarray/temp_elide.h15
-rw-r--r--numpy/core/tests/test_multiarray.py47
6 files changed, 536 insertions, 5 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index f45042bec..891ba2a19 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -824,6 +824,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'shape.c'),
join('src', 'multiarray', 'scalarapi.c'),
join('src', 'multiarray', 'scalartypes.c.src'),
+ join('src', 'multiarray', 'temp_elide.c'),
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'ucsnarrow.c'),
join('src', 'multiarray', 'vdot.c'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 357051cdb..7691a2aeb 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -107,7 +107,8 @@ MANDATORY_FUNCS = ["sin", "cos", "tan", "sinh", "cosh", "tanh", "fabs",
OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
"rint", "trunc", "exp2", "log2", "hypot", "atan2", "pow",
"copysign", "nextafter", "ftello", "fseeko",
- "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate"]
+ "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate",
+ "backtrace"]
OPTIONAL_HEADERS = [
@@ -116,6 +117,7 @@ OPTIONAL_HEADERS = [
"emmintrin.h", # SSE2
"features.h", # for glibc version linux
"xlocale.h", # see GH#8367
+ "dlfcn.h", # dladdr
]
# optional gcc compiler builtins and their call arguments and optional a
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index fec015a30..c4e675430 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -12,6 +12,7 @@
#include "npy_import.h"
#include "common.h"
#include "number.h"
+#include "temp_elide.h"
/*************************************************************************
**************** Implement Number Protocol ****************************
@@ -353,23 +354,60 @@ PyArray_GenericInplaceUnaryFunction(PyArrayObject *m1, PyObject *op)
}
static PyObject *
+array_inplace_add(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_subtract(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_multiply(PyArrayObject *m1, PyObject *m2);
+#if !defined(NPY_PY3K)
+static PyObject *
+array_inplace_divide(PyArrayObject *m1, PyObject *m2);
+#endif
+static PyObject *
+array_inplace_true_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_floor_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_and(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_or(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_left_shift(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_right_shift(PyArrayObject *m1, PyObject *m2);
+
+static PyObject *
array_add(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__add__", "__radd__", 0, nb_add);
+ if (try_binary_elide(m1, m2, &array_inplace_add, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.add);
}
static PyObject *
array_subtract(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__sub__", "__rsub__", 0, nb_subtract);
+ if (try_binary_elide(m1, m2, &array_inplace_subtract, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.subtract);
}
static PyObject *
array_multiply(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__mul__", "__rmul__", 0, nb_multiply);
+ if (try_binary_elide(m1, m2, &array_inplace_multiply, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.multiply);
}
@@ -377,7 +415,11 @@ array_multiply(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__div__", "__rdiv__", 0, nb_divide);
+ if (try_binary_elide(m1, m2, &array_inplace_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.divide);
}
#endif
@@ -529,7 +571,7 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace)
return NULL;
}
- if (inplace) {
+ if (inplace || can_elide_temp_unary(a1)) {
return PyArray_GenericInplaceUnaryFunction(a1, fastop);
} else {
return PyArray_GenericUnaryFunction(a1, fastop);
@@ -588,53 +630,82 @@ array_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo))
static PyObject *
array_negative(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.negative);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.negative);
}
static PyObject *
array_absolute(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.absolute);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.absolute);
}
static PyObject *
array_invert(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.invert);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.invert);
}
static PyObject *
array_left_shift(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__lshift__", "__rlshift__", 0, nb_lshift);
+ if (try_binary_elide(m1, m2, &array_inplace_left_shift, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.left_shift);
}
static PyObject *
array_right_shift(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__rshift__", "__rrshift__", 0, nb_rshift);
+ if (try_binary_elide(m1, m2, &array_inplace_right_shift, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.right_shift);
}
static PyObject *
array_bitwise_and(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__and__", "__rand__", 0, nb_and);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_and, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_and);
}
static PyObject *
array_bitwise_or(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__or__", "__ror__", 0, nb_or);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_or, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_or);
}
static PyObject *
array_bitwise_xor(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__xor__", "__rxor__", 0, nb_xor);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_xor, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_xor);
}
@@ -726,14 +797,24 @@ array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_floor_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__floordiv__", "__rfloordiv__", 0, nb_floor_divide);
+ if (try_binary_elide(m1, m2, &array_inplace_floor_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.floor_divide);
}
static PyObject *
array_true_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__truediv__", "__rtruediv__", 0, nb_true_divide);
+ if (PyArray_CheckExact(m1) &&
+ (PyArray_ISFLOAT(m1) || PyArray_ISCOMPLEX(m1)) &&
+ try_binary_elide(m1, m2, &array_inplace_true_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.true_divide);
}
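These hooks only attempt elision above the size threshold, since the backtrace check itself costs time. A rough way to observe the claimed 1.5x to 2x gain is to compare the expression form against a manual inplace rewrite; a sketch, with array sizes chosen as assumptions around the heuristic:

    # Illustrative benchmark; threshold and speedups are machine dependent.
    import timeit

    setup = "import numpy as np; a = np.ones({n}); b = np.ones({n})"
    for n in (10**4, 10**6):  # below and above the ~256KiB elision threshold
        expr = timeit.timeit("r = -a + b", setup.format(n=n), number=200)
        inpl = timeit.timeit("r = -a; r += b", setup.format(n=n), number=200)
        print("n=%d  expression/inplace time ratio: %.2f" % (n, expr / inpl))

With elision active the ratio should approach 1 for the large case, as the expression then performs the same work as the manual inplace rewrite.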
diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c
new file mode 100644
index 000000000..5d18e1a08
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.c
@@ -0,0 +1,391 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/*
+ * Functions used to try to avoid/elide temporaries in python expressions
+ * of type a + b + b by translating some operations into inplace operations.
+ * This example translates to this bytecode:
+ *
+ * 0 LOAD_FAST 0 (a)
+ * 3 LOAD_FAST 1 (b)
+ * 6 BINARY_ADD
+ * 7 LOAD_FAST 1 (b)
+ * 10 BINARY_ADD
+ *
+ * The two named variables get their reference count increased by the load
+ * instructions so they always have a reference count larger than 1.
+ * The temporary of the first BINARY_ADD on the other hand only has a count of
+ * 1. Only temporaries can have a count of 1 in python so we can use this to
+ * transform the second operation into an inplace operation and not affect the
+ * output of the program.
+ * CPython does the same thing to resize memory instead of copying when doing
+ * string concatenation.
+ * The gain can be very significant (4x-6x) when avoiding the temporary allows
+ * the operation to remain in the cpu caches, and can still be 50% faster for
+ * arrays larger than the cpu cache size.
+ *
+ * A complication is that a DSO (dynamic shared object) module (e.g. cython)
+ * could call the PyNumber functions directly on arrays with reference count of
+ * 1.
+ * This is handled by checking the call stack to verify that we have been
+ * called directly from the cpython interpreter.
+ * To achieve this we check that all functions in the callstack until the
+ * cpython frame evaluation function are located in cpython or numpy.
+ * This is an expensive operation so temporaries are only avoided for rather
+ * large arrays.
+ *
+ * A possible future improvement would be to change cpython to give us access
+ * to the top of the stack. Then we could just check that the objects involved
+ * are on the cpython stack instead of checking the function callstack.
+ *
+ * Elision can be applied to all operations that do have inplace variants and
+ * do not change types (addition, subtraction, multiplication, float division,
+ * logical and bitwise operations ...)
+ * For commutative operations (addition, multiplication, ...) if eliding into
+ * the lefthand side fails it can succeed on the righthand side by swapping the
+ * arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b.
+ *
+ * TODO only supports systems with backtrace(), windows can probably be
+ * supported too by using the appropriate windows apis.
+ */
+
+#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
+/* 1 prints elided operations, 2 prints stacktraces */
+#define NPY_ELIDE_DEBUG 0
+#define NPY_MAX_STACKSIZE 10
+
+#if PY_VERSION_HEX >= 0x03060000
+/* TODO can PEP 523 be used for this somehow? */
+#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
+#else
+#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
+#endif
+/*
+ * Heuristic array size in bytes at which the overhead of generating the
+ * backtrace becomes smaller than the speed gained by inplace operations.
+ * Depends on the stack depth being checked. Measurements with a depth of 10
+ * show it becoming worthwhile around 100KiB, but to be conservative we put
+ * it higher, around where the L2 cache spills.
+ */
+#ifndef Py_DEBUG
+#define NPY_MIN_ELIDE_BYTES (256 * 1024)
+#else
+/*
+ * in debug mode always elide, but skip scalars as these can convert to 0d
+ * arrays during inplace operations
+ */
+#define NPY_MIN_ELIDE_BYTES (32)
+#endif
+#include <dlfcn.h>
+#include <execinfo.h>
+
+/*
+ * linear search for a pointer in the table
+ * number of pointers is usually quite small but if a performance impact can be
+ * measured this could be converted to a binary search
+ */
+static int
+find_addr(void * addresses[], npy_intp naddr, void * addr)
+{
+ npy_intp j;
+ for (j = 0; j < naddr; j++) {
+ if (addr == addresses[j]) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+check_callers(int * cannot)
+{
+ /*
+ * get the base addresses of multiarray and python and check whether the
+ * backtrace stays inside these libraries, calling dladdr only when a new
+ * maximum address is found.
+ * When everything after the initial multiarray frames is inside python we
+ * can elide, as no C-API user could have messed up the reference counts.
+ * Only check until the python frame evaluation function is found;
+ * approx 10us overhead for a stack depth of 10.
+ *
+ * TODO some calls go through scalarmath in umath, but we cannot get its
+ * base address from multiarraymodule as it is not linked against it
+ */
+ static int init = 0;
+ /*
+ * measured DSO object memory start and end, if an address is located
+ * inside these bounds it is part of that library so we don't need to call
+ * dladdr on it (assuming linear memory)
+ */
+ static void * pos_python_start;
+ static void * pos_python_end;
+ static void * pos_ma_start;
+ static void * pos_ma_end;
+
+ /* known address storage to save dladdr calls */
+ static void * py_addr[64];
+ static void * pyeval_addr[64];
+ static npy_intp n_py_addr = 0;
+ static npy_intp n_pyeval = 0;
+
+ void *buffer[NPY_MAX_STACKSIZE];
+ int i, nptrs;
+ int ok = 0;
+ /* cannot determine callers */
+ if (init == -1) {
+ *cannot = 1;
+ return 0;
+ }
+
+ nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
+ if (nptrs == 0) {
+ /* complete failure, disable elision */
+ init = -1;
+ *cannot = 1;
+ return 0;
+ }
+
+ /* setup DSO base addresses, ends updated later */
+ if (NPY_UNLIKELY(init == 0)) {
+ Dl_info info;
+ /* get python base address */
+ if (dladdr(&PyNumber_Or, &info)) {
+ pos_python_start = info.dli_fbase;
+ pos_python_end = info.dli_fbase;
+ }
+ else {
+ init = -1;
+ return 0;
+ }
+ /* get multiarray base address */
+ if (dladdr(&PyArray_SetNumericOps, &info)) {
+ pos_ma_start = info.dli_fbase;
+ pos_ma_end = info.dli_fbase;
+ }
+ else {
+ init = -1;
+ return 0;
+ }
+ init = 1;
+ }
+
+ /* loop over callstack addresses to check if they leave numpy or cpython */
+ for (i = 0; i < nptrs; i++) {
+ Dl_info info;
+ int in_python = 0;
+ int in_multiarray = 0;
+#if NPY_ELIDE_DEBUG >= 2
+ dladdr(buffer[i], &info);
+ printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
+ info.dli_sname, info.dli_saddr);
+#endif
+
+ /* check stored DSO boundaries first */
+ if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
+ in_python = 1;
+ }
+ else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
+ in_multiarray = 1;
+ }
+
+ /* update DSO boundaries via dladdr if necessary */
+ if (!in_python && !in_multiarray) {
+ if (dladdr(buffer[i], &info) == 0) {
+ init = -1;
+ ok = 0;
+ break;
+ }
+ /* update DSO end */
+ if (info.dli_fbase == pos_python_start) {
+ pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
+ in_python = 1;
+ }
+ else if (info.dli_fbase == pos_ma_start) {
+ pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
+ in_multiarray = 1;
+ }
+ }
+
+ /* no longer in ok libraries and not reached PyEval -> no elide */
+ if (!in_python && !in_multiarray) {
+ ok = 0;
+ break;
+ }
+
+ /* in python check if the frame eval function was reached */
+ if (in_python) {
+ /* if reached eval we are done */
+ if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
+ ok = 1;
+ break;
+ }
+ /*
+ * check if it's some other function; use the pointer lookup table to
+ * save expensive dladdr calls
+ */
+ if (find_addr(py_addr, n_py_addr, buffer[i])) {
+ continue;
+ }
+
+ /* new python address, check for PyEvalFrame */
+ if (dladdr(buffer[i], &info) == 0) {
+ init = -1;
+ ok = 0;
+ break;
+ }
+ if (info.dli_sname &&
+ strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
+ if (n_pyeval < sizeof(pyeval_addr) / sizeof(pyeval_addr[0])) {
+ /* store address to not have to dladdr it again */
+ pyeval_addr[n_pyeval++] = buffer[i];
+ }
+ ok = 1;
+ break;
+ }
+ else if (n_py_addr < sizeof(py_addr) / sizeof(py_addr[0])) {
+ /* store other py function to not have to dladdr it again */
+ py_addr[n_py_addr++] = buffer[i];
+ }
+ }
+ }
+
+ /* all frames after numpy are from python, we can elide */
+ if (ok) {
+ *cannot = 0;
+ return 1;
+ }
+ else {
+#if NPY_ELIDE_DEBUG != 0
+ puts("cannot elide due to c-api usage");
+#endif
+ *cannot = 1;
+ return 0;
+ }
+}
+
+/*
+ * check whether, in "alhs @op@ orhs", alhs is a temporary (refcnt == 1) so
+ * we can do the operation inplace instead of creating a new temporary.
+ * "cannot" is set to true if elision is impossible even with swapped arguments
+ */
+static int
+can_elide_temp(PyArrayObject * alhs, PyObject * orhs, int * cannot)
+{
+ /*
+ * to be a candidate the array needs to have a reference count of 1, be an
+ * exact array of a basic type, own its data, and be larger than the threshold
+ */
+ if (Py_REFCNT(alhs) != 1 || !PyArray_CheckExact(alhs) ||
+ PyArray_DESCR(alhs)->type_num >= NPY_OBJECT ||
+ !(PyArray_FLAGS(alhs) & NPY_ARRAY_OWNDATA) ||
+ PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
+ return 0;
+ }
+ if (PyArray_CheckExact(orhs) ||
+ PyArray_CheckAnyScalar(orhs)) {
+ PyArrayObject * arhs;
+
+ /* create array from right hand side */
+ Py_INCREF(orhs);
+ arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
+ if (arhs == NULL) {
+ return 0;
+ }
+
+ /*
+ * if rhs is not a scalar dimensions must match
+ * TODO: one could allow broadcasting on equal types
+ */
+ if (!(PyArray_NDIM(arhs) == 0 ||
+ (PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
+ PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
+ PyArray_NDIM(arhs))))) {
+ Py_DECREF(arhs);
+ return 0;
+ }
+
+ /* must be safe to cast (checks values for scalar in rhs) */
+ if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
+ NPY_SAFE_CASTING)) {
+ Py_DECREF(arhs);
+ return check_callers(cannot);
+ }
+ Py_DECREF(arhs);
+ }
+
+ return 0;
+}
+
+/*
+ * try eliding a binary op, if commutative is true also try swapped arguments
+ */
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative)
+{
+ /* set when no elision can be done independent of argument order */
+ int cannot = 0;
+ if (can_elide_temp(m1, m2, &cannot)) {
+ *res = inplace_op(m1, m2);
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in binary op");
+#endif
+ return 1;
+ }
+ else if (commutative && !cannot) {
+ if (can_elide_temp((PyArrayObject *)m2, (PyObject *)m1, &cannot)) {
+ *res = inplace_op((PyArrayObject *)m2, (PyObject *)m1);
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in commutative binary op");
+#endif
+ return 1;
+ }
+ }
+ *res = NULL;
+ return 0;
+}
+
+/* try elide unary temporary */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+ int cannot;
+ if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
+ PyArray_DESCR(m1)->type_num == NPY_VOID ||
+ !(PyArray_FLAGS(m1) & NPY_ARRAY_OWNDATA) ||
+ PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
+ return 0;
+ }
+ if (check_callers(&cannot)) {
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in unary op");
+#endif
+ return 1;
+ }
+ else {
+ return 0;
+ }
+}
+#else /* unsupported interpreter or missing backtrace */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+ return 0;
+}
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative)
+{
+ *res = NULL;
+ return 0;
+}
+#endif
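A short sketch of the user-visible behavior described in the comments above, i.e. the commutative swap and the size threshold; names and sizes are illustrative assumptions, since the elision itself is fully transparent:

    # Illustrative only; 500000 doubles (~4MB) clear the ~256KiB
    # NPY_MIN_ELIDE_BYTES heuristic, 10 doubles do not.
    import numpy as np

    small = np.ones(10)       # below the threshold: no elision attempted
    big = np.ones(500000)     # above the threshold: candidate for elision

    # The temporary (big * 2) has a reference count of 1 and multiplication
    # is commutative, so the outer operation can be evaluated internally as
    # (2 * big) * big, reusing the temporary's buffer.
    r_big = big * (big * 2)
    r_small = small * (small * 2)

    # The results are identical whether or not elision fired.
    assert r_big[0] == r_small[0] == 2.0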
diff --git a/numpy/core/src/multiarray/temp_elide.h b/numpy/core/src/multiarray/temp_elide.h
new file mode 100644
index 000000000..d073adf28
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.h
@@ -0,0 +1,15 @@
+#ifndef _NPY_ARRAY_TEMP_AVOID_H_
+#define _NPY_ARRAY_TEMP_AVOID_H_
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1);
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative);
+
+#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index fa5051ba7..b780c5558 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -2743,8 +2743,9 @@ class TestBinop(object):
# d = input.copy() # refcount 1
# return d, d + d # PyNumber_Add without increasing refcount
from numpy.core.multiarray_tests import incref_elide
- d = np.ones(5)
+ d = np.ones(100000)
orig, res = incref_elide(d)
+ d + d
# the returned original should not be changed by an inplace operation
assert_array_equal(orig, d)
assert_array_equal(res, d + d)
@@ -2758,12 +2759,52 @@ class TestBinop(object):
# return l[4] + l[4] # PyNumber_Add without increasing refcount
from numpy.core.multiarray_tests import incref_elide_l
# padding with 1 makes sure the object on the stack is not overwritten
- l = [1, 1, 1, 1, np.ones(5)]
+ l = [1, 1, 1, 1, np.ones(100000)]
res = incref_elide_l(l)
# the returned original should not be changed by an inplace operation
- assert_array_equal(l[4], np.ones(5))
+ assert_array_equal(l[4], np.ones(100000))
assert_array_equal(res, l[4] + l[4])
+ def test_temporary_with_cast(self):
+ # check that we don't elide into a temporary which would need casting
+ d = np.ones(200000, dtype=np.int64)
+ assert_equal(((d + d) + 2**222).dtype, np.dtype('O'))
+
+ r = ((d + d) / 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = np.true_divide((d + d), 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) / 2.)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) // 2)
+ assert_equal(r.dtype, np.dtype(np.int64))
+
+ # commutative elision into the astype result
+ f = np.ones(100000, dtype=np.float32)
+ assert_equal(((f + f) + f.astype(np.float64)).dtype, np.dtype('f8'))
+
+ # no elision into f + f
+ d = f.astype(np.float64)
+ assert_equal(((f + f) + d).dtype, np.dtype('f8'))
+
+ def test_elide_broadcast(self):
+ # test no elision on broadcast to higher dimension
+ # this only triggers the elision code path in debug mode; triggering it
+ # in normal mode needs a 256kb matching dimension, i.e. a lot of memory
+ d = np.ones((2000, 1), dtype=int)
+ b = np.ones((2000), dtype=np.bool)
+ r = (1 - d) + b
+ assert_equal(r, 1)
+ assert_equal(r.shape, (2000, 2000))
+
+ def test_elide_scalar(self):
+ # check inplace op does not create ndarray from scalars
+ a = np.bool_()
+ assert_(type(~(a & a)) is np.bool_)
+
def test_ufunc_override_rop_precedence(self):
# 2016-01-29: NUMPY_UFUNC_DISABLED
return