author     Charles Harris <charlesr.harris@gmail.com>    2017-02-24 18:18:17 -0700
committer  GitHub <noreply@github.com>                   2017-02-24 18:18:17 -0700
commit     7bc881d053d378c3653dfe0c01c5c7ac4b0c6cdb (patch)
tree       bb4401b5da7b9ad4aa057781243d8715d65377eb /numpy
parent     23ac8554b680ae86baf5345af6f18097afe591ce (diff)
parent     c3e24b2f2860ce1912f578cf996184198a3cfd25 (diff)
Merge pull request #7997 from juliantaylor/temporary-elide
ENH: avoid temporary arrays in expressions (again)
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/setup.py                        1
-rw-r--r--  numpy/core/setup_common.py                 4
-rw-r--r--  numpy/core/src/multiarray/number.c        83
-rw-r--r--  numpy/core/src/multiarray/temp_elide.c   391
-rw-r--r--  numpy/core/src/multiarray/temp_elide.h    15
-rw-r--r--  numpy/core/tests/test_multiarray.py       47
6 files changed, 536 insertions(+), 5 deletions(-)
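For orientation, a minimal sketch of the expression pattern this merge optimizes (illustrative, not part of the patch; the size is chosen to exceed the elision threshold discussed in temp_elide.c below):

```python
import numpy as np

# `a + b + b` first computes the temporary t = a + b. Only the
# interpreter stack references t (refcount 1), so the second add may
# reuse t's buffer in place, saving one large allocation.
a = np.ones(1 << 20)  # ~8 MiB, well above the 256 KiB threshold
b = np.ones(1 << 20)
c = a + b + b  # result is identical with or without elision
```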
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index f45042bec..891ba2a19 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -824,6 +824,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'shape.c'),
join('src', 'multiarray', 'scalarapi.c'),
join('src', 'multiarray', 'scalartypes.c.src'),
+ join('src', 'multiarray', 'temp_elide.c'),
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'ucsnarrow.c'),
join('src', 'multiarray', 'vdot.c'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 357051cdb..7691a2aeb 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -107,7 +107,8 @@ MANDATORY_FUNCS = ["sin", "cos", "tan", "sinh", "cosh", "tanh", "fabs",
OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
"rint", "trunc", "exp2", "log2", "hypot", "atan2", "pow",
"copysign", "nextafter", "ftello", "fseeko",
- "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate"]
+ "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate",
+ "backtrace"]
OPTIONAL_HEADERS = [
@@ -116,6 +117,7 @@ OPTIONAL_HEADERS = [
"emmintrin.h", # SSE2
"features.h", # for glibc version linux
"xlocale.h" # see GH#8367
+ "dlfcn.h", # dladdr
]
# optional gcc compiler builtins and their call arguments and optional a
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index fec015a30..c4e675430 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -12,6 +12,7 @@
#include "npy_import.h"
#include "common.h"
#include "number.h"
+#include "temp_elide.h"
/*************************************************************************
**************** Implement Number Protocol ****************************
@@ -353,23 +354,60 @@ PyArray_GenericInplaceUnaryFunction(PyArrayObject *m1, PyObject *op)
}
static PyObject *
+array_inplace_add(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_subtract(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_multiply(PyArrayObject *m1, PyObject *m2);
+#if !defined(NPY_PY3K)
+static PyObject *
+array_inplace_divide(PyArrayObject *m1, PyObject *m2);
+#endif
+static PyObject *
+array_inplace_true_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_floor_divide(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_and(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_or(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_left_shift(PyArrayObject *m1, PyObject *m2);
+static PyObject *
+array_inplace_right_shift(PyArrayObject *m1, PyObject *m2);
+
+static PyObject *
array_add(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__add__", "__radd__", 0, nb_add);
+ if (try_binary_elide(m1, m2, &array_inplace_add, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.add);
}
static PyObject *
array_subtract(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__sub__", "__rsub__", 0, nb_subtract);
+ if (try_binary_elide(m1, m2, &array_inplace_subtract, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.subtract);
}
static PyObject *
array_multiply(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__mul__", "__rmul__", 0, nb_multiply);
+ if (try_binary_elide(m1, m2, &array_inplace_multiply, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.multiply);
}
@@ -377,7 +415,11 @@ array_multiply(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__div__", "__rdiv__", 0, nb_divide);
+ if (try_binary_elide(m1, m2, &array_inplace_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.divide);
}
#endif
@@ -529,7 +571,7 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace)
return NULL;
}
- if (inplace) {
+ if (inplace || can_elide_temp_unary(a1)) {
return PyArray_GenericInplaceUnaryFunction(a1, fastop);
} else {
return PyArray_GenericUnaryFunction(a1, fastop);
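A hedged sketch of the user-visible effect of the fast_scalar_power change (assumed example, not from the patch): a refcount-1 temporary can now take the in-place fast scalar power path.

```python
import numpy as np

a = np.ones(1 << 20)
# (a + a) is a temporary, so squaring can run in place on its buffer
# via the fast scalar power path instead of allocating a new array.
r = (a + a) ** 2
```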
@@ -588,53 +630,82 @@ array_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo))
static PyObject *
array_negative(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.negative);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.negative);
}
static PyObject *
array_absolute(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.absolute);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.absolute);
}
static PyObject *
array_invert(PyArrayObject *m1)
{
+ if (can_elide_temp_unary(m1)) {
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.invert);
+ }
return PyArray_GenericUnaryFunction(m1, n_ops.invert);
}
static PyObject *
array_left_shift(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__lshift__", "__rlshift__", 0, nb_lshift);
+ if (try_binary_elide(m1, m2, &array_inplace_left_shift, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.left_shift);
}
static PyObject *
array_right_shift(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__rshift__", "__rrshift__", 0, nb_rshift);
+ if (try_binary_elide(m1, m2, &array_inplace_right_shift, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.right_shift);
}
static PyObject *
array_bitwise_and(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__and__", "__rand__", 0, nb_and);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_and, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_and);
}
static PyObject *
array_bitwise_or(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__or__", "__ror__", 0, nb_or);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_or, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_or);
}
static PyObject *
array_bitwise_xor(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__xor__", "__rxor__", 0, nb_xor);
+ if (try_binary_elide(m1, m2, &array_inplace_bitwise_xor, &res, 1)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.bitwise_xor);
}
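The unary hooks above (negative, absolute, invert) follow the same pattern; an illustrative Python sketch (sizes assume the release-build threshold):

```python
import numpy as np

a = np.ones(1 << 20)
r = -(a - 1.0)  # negation reuses the (a - 1.0) temporary in place

i = np.ones(1 << 20, dtype=np.int64)
r2 = ~(i + i)   # inversion elides into the (i + i) temporary
```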
@@ -726,14 +797,24 @@ array_inplace_bitwise_xor(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_floor_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__floordiv__", "__rfloordiv__", 0, nb_floor_divide);
+ if (try_binary_elide(m1, m2, &array_inplace_floor_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.floor_divide);
}
static PyObject *
array_true_divide(PyArrayObject *m1, PyObject *m2)
{
+ PyObject * res;
GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__truediv__", "__rtruediv__", 0, nb_true_divide);
+ if (PyArray_CheckExact(m1) &&
+ (PyArray_ISFLOAT(m1) || PyArray_ISCOMPLEX(m1)) &&
+ try_binary_elide(m1, m2, &array_inplace_true_divide, &res, 0)) {
+ return res;
+ }
return PyArray_GenericBinaryFunction(m1, m2, n_ops.true_divide);
}
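The extra float/complex guard on true division is needed because integer inputs change dtype under true division, so their temporaries can never be reused; a sketch mirroring the new tests below:

```python
import numpy as np

d = np.ones(1 << 20, dtype=np.int64)
# (d + d) is int64 but the result must be float64, so the temporary
# cannot absorb the division and no elision is attempted.
print(((d + d) / 2).dtype)  # float64

f = np.ones(1 << 20)
# float input: the dtype is unchanged and the temporary can be reused.
print(((f + f) / 2).dtype)  # float64
```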
diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c
new file mode 100644
index 000000000..5d18e1a08
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.c
@@ -0,0 +1,391 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+
+#define NPY_NUMBER_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/*
+ * Functions used to try to avoid/elide temporaries in python expressions
+ * of type a + b + b by translating some operations into inplace operations.
+ * This example translates to this bytecode:
+ *
+ * 0 LOAD_FAST 0 (a)
+ * 3 LOAD_FAST 1 (b)
+ * 6 BINARY_ADD
+ * 7 LOAD_FAST 1 (b)
+ * 10 BINARY_ADD
+ *
+ * The two named variables get their reference count increased by the load
+ * instructions so they always have a reference count larger than 1.
+ * The temporary of the first BINARY_ADD on the other hand only has a count of
+ * 1. Only temporaries can have a count of 1 in python so we can use this to
+ * transform the second operation into an inplace operation and not affect the
+ * output of the program.
+ * CPython does the same thing to resize memory instead of copying when doing
+ * string concatenation.
+ * The gain can be very significant (4x-6x) when avoiding the temporary allows
+ * the operation to remain in the cpu caches and can still be 50% faster for
+ * arrays larger than the cpu cache size.
+ *
+ * A complication is that a DSO (dynamic shared object) module (e.g. cython)
+ * could call the PyNumber functions directly on arrays with reference count of
+ * 1.
+ * This is handled by checking the call stack to verify that we have been
+ * called directly from the cpython interpreter.
+ * To achieve this we check that all functions in the callstack until the
+ * cpython frame evaluation function are located in cpython or numpy.
+ * This is an expensive operation so temporaries are only avoided for rather
+ * large arrays.
+ *
+ * A possible future improvement would be to change cpython to give us access
+ * to the top of the stack. Then we could just check that the objects involved
+ * are on the cpython stack instead of checking the function callstack.
+ *
+ * Elision can be applied to all operations that do have inplace variants and
+ * do not change types (addition, subtraction, multiplication, float division,
+ * logical and bitwise operations ...)
+ * For commutative operations (addition, multiplication, ...), if eliding into
+ * the left-hand side fails it can succeed on the right-hand side by swapping
+ * the arguments. E.g. b * (a * 2) can be elided by changing it to (2 * a) * b.
+ *
+ * TODO: this only supports systems with backtrace(); windows could probably
+ * be supported too by using the appropriate windows APIs.
+ */
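A Python-level illustration of the refcount argument above (a sketch; on CPython 3.11+ dis shows BINARY_OP rather than BINARY_ADD):

```python
import dis
import sys

def expr(a, b):
    return a + b + b

# The first add's result is held only by the interpreter's value stack,
# so its reference count is 1 when the second add consumes it.
dis.dis(expr)

# Named objects always carry extra references: a fresh object bound to
# one name reports at least 2 (the binding plus the call argument).
x = []
print(sys.getrefcount(x))
```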
+
+#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
+/* 1 prints elided operations, 2 prints stacktraces */
+#define NPY_ELIDE_DEBUG 0
+#define NPY_MAX_STACKSIZE 10
+
+#if PY_VERSION_HEX >= 0x03060000
+/* TODO: can pep523 be used for this somehow? */
+#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
+#else
+#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
+#endif
+/*
+ * Heuristic size of the array in bytes at which the overhead of generating a
+ * backtrace becomes smaller than the speed gained by inplace operations.
+ * Depends on the stack depth being checked. Measurements with a stack depth
+ * of 10 show it becoming worthwhile around 100KiB, but to be conservative it
+ * is set higher, around where the L2 cache spills.
+ */
+#ifndef Py_DEBUG
+#define NPY_MIN_ELIDE_BYTES (256 * 1024)
+#else
+/*
+ * in debug mode always elide, but skip scalars as these can convert to 0d
+ * arrays during inplace operations
+ */
+#define NPY_MIN_ELIDE_BYTES (32)
+#endif
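A rough way to observe the threshold from Python (timings illustrative only; 256 KiB is the release-build NPY_MIN_ELIDE_BYTES defined above):

```python
import timeit
import numpy as np

small = np.ones(1000)     # ~8 KiB: below the threshold, temporaries kept
large = np.ones(1 << 21)  # ~16 MiB: large enough to be elision-eligible

for name, arr in (("small", small), ("large", large)):
    t = timeit.timeit(lambda: arr + arr + arr, number=100)
    print(name, t)
```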
+#include <dlfcn.h>
+#include <execinfo.h>
+
+/*
+ * linear search for a pointer in the table;
+ * the number of pointers is usually quite small, but if a performance impact
+ * can be measured this could be converted to a binary search
+ */
+static int
+find_addr(void * addresses[], npy_intp naddr, void * addr)
+{
+ npy_intp j;
+ for (j = 0; j < naddr; j++) {
+ if (addr == addresses[j]) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+check_callers(int * cannot)
+{
+ /*
+ * get the base addresses of multiarray and python, and check whether the
+ * backtrace stays within these libraries, calling dladdr only when a new
+ * maximum address is found.
+ * When, after the initial multiarray frames, everything is inside python,
+ * we can elide, as no C-API user could have messed up the reference
+ * counts. Only check until the python frame evaluation function is
+ * found; the overhead is approximately 10us for a stack depth of 10.
+ *
+ * TODO: some calls go through scalarmath in umath, but we cannot get its
+ * base address from multiarraymodule as it is not linked against it
+ */
+ static int init = 0;
+ /*
+ * measured DSO memory start and end; if an address lies within these
+ * bounds it is part of that library, so we don't need to call dladdr
+ * on it (assuming linear memory)
+ */
+ static void * pos_python_start;
+ static void * pos_python_end;
+ static void * pos_ma_start;
+ static void * pos_ma_end;
+
+ /* known address storage to save dladdr calls */
+ static void * py_addr[64];
+ static void * pyeval_addr[64];
+ static npy_intp n_py_addr = 0;
+ static npy_intp n_pyeval = 0;
+
+ void *buffer[NPY_MAX_STACKSIZE];
+ int i, nptrs;
+ int ok = 0;
+ /* cannot determine callers */
+ if (init == -1) {
+ *cannot = 1;
+ return 0;
+ }
+
+ nptrs = backtrace(buffer, NPY_MAX_STACKSIZE);
+ if (nptrs == 0) {
+ /* complete failure, disable elision */
+ init = -1;
+ *cannot = 1;
+ return 0;
+ }
+
+ /* setup DSO base addresses, ends updated later */
+ if (NPY_UNLIKELY(init == 0)) {
+ Dl_info info;
+ /* get python base address */
+ if (dladdr(&PyNumber_Or, &info)) {
+ pos_python_start = info.dli_fbase;
+ pos_python_end = info.dli_fbase;
+ }
+ else {
+ init = -1;
+ return 0;
+ }
+ /* get multiarray base address */
+ if (dladdr(&PyArray_SetNumericOps, &info)) {
+ pos_ma_start = info.dli_fbase;
+ pos_ma_end = info.dli_fbase;
+ }
+ else {
+ init = -1;
+ return 0;
+ }
+ init = 1;
+ }
+
+ /* loop over callstack addresses to check if they leave numpy or cpython */
+ for (i = 0; i < nptrs; i++) {
+ Dl_info info;
+ int in_python = 0;
+ int in_multiarray = 0;
+#if NPY_ELIDE_DEBUG >= 2
+ dladdr(buffer[i], &info);
+ printf("%s(%p) %s(%p)\n", info.dli_fname, info.dli_fbase,
+ info.dli_sname, info.dli_saddr);
+#endif
+
+ /* check stored DSO boundaries first */
+ if (buffer[i] >= pos_python_start && buffer[i] <= pos_python_end) {
+ in_python = 1;
+ }
+ else if (buffer[i] >= pos_ma_start && buffer[i] <= pos_ma_end) {
+ in_multiarray = 1;
+ }
+
+ /* update DSO boundaries via dladdr if necessary */
+ if (!in_python && !in_multiarray) {
+ if (dladdr(buffer[i], &info) == 0) {
+ init = -1;
+ ok = 0;
+ break;
+ }
+ /* update DSO end */
+ if (info.dli_fbase == pos_python_start) {
+ pos_python_end = NPY_NUMBER_MAX(buffer[i], pos_python_end);
+ in_python = 1;
+ }
+ else if (info.dli_fbase == pos_ma_start) {
+ pos_ma_end = NPY_NUMBER_MAX(buffer[i], pos_ma_end);
+ in_multiarray = 1;
+ }
+ }
+
+ /* not in a known-good library and PyEval not reached -> no elision */
+ if (!in_python && !in_multiarray) {
+ ok = 0;
+ break;
+ }
+
+ /* in python check if the frame eval function was reached */
+ if (in_python) {
+ /* if reached eval we are done */
+ if (find_addr(pyeval_addr, n_pyeval, buffer[i])) {
+ ok = 1;
+ break;
+ }
+ /*
+ * check if it's some other python function, using the pointer lookup
+ * table to save expensive dladdr calls
+ */
+ if (find_addr(py_addr, n_py_addr, buffer[i])) {
+ continue;
+ }
+
+ /* new python address, check for PyEvalFrame */
+ if (dladdr(buffer[i], &info) == 0) {
+ init = -1;
+ ok = 0;
+ break;
+ }
+ if (info.dli_sname &&
+ strcmp(info.dli_sname, PYFRAMEEVAL_FUNC) == 0) {
+ if (n_pyeval < sizeof(pyeval_addr) / sizeof(pyeval_addr[0])) {
+ /* store address to not have to dladdr it again */
+ pyeval_addr[n_pyeval++] = buffer[i];
+ }
+ ok = 1;
+ break;
+ }
+ else if (n_py_addr < sizeof(py_addr) / sizeof(py_addr[0])) {
+ /* store other py function to not have to dladdr it again */
+ py_addr[n_py_addr++] = buffer[i];
+ }
+ }
+ }
+
+ /* all frames after numpy are from python, we can elide */
+ if (ok) {
+ *cannot = 0;
+ return 1;
+ }
+ else {
+#if NPY_ELIDE_DEBUG != 0
+ puts("cannot elide due to c-api usage");
+#endif
+ *cannot = 1;
+ return 0;
+ }
+}
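check_callers() has no direct Python equivalent, but as a loose analogy (hypothetical helper, not a numpy API) the same idea of walking the call stack and accepting only trusted callers looks like this:

```python
import sys

def called_only_from(prefixes):
    # Hypothetical analogy of check_callers(): inspect every frame of
    # our caller's stack and verify it belongs to an allowed module,
    # the way the C code verifies each return address lies in numpy
    # or cpython before trusting a refcount of 1.
    frame = sys._getframe(1)
    while frame is not None:
        mod = frame.f_globals.get("__name__", "")
        if not any(mod == p or mod.startswith(p + ".") for p in prefixes):
            return False
        frame = frame.f_back
    return True
```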
+
+/*
+ * check if in "alhs @op@ orhs" that alhs is a temporary (refcnt == 1) so we
+ * can do inplace operations instead of creating a new temporary
+ * "cannot" is set to true if it cannot be done even with swapped arguments
+ */
+static int
+can_elide_temp(PyArrayObject * alhs, PyObject * orhs, int * cannot)
+{
+ /*
+ * to be a candidate the array needs to have a reference count of 1, be an
+ * exact array of a basic type, own its data, and be larger than the size
+ * threshold
+ */
+ if (Py_REFCNT(alhs) != 1 || !PyArray_CheckExact(alhs) ||
+ PyArray_DESCR(alhs)->type_num >= NPY_OBJECT ||
+ !(PyArray_FLAGS(alhs) & NPY_ARRAY_OWNDATA) ||
+ PyArray_NBYTES(alhs) < NPY_MIN_ELIDE_BYTES) {
+ return 0;
+ }
+ if (PyArray_CheckExact(orhs) ||
+ PyArray_CheckAnyScalar(orhs)) {
+ PyArrayObject * arhs;
+
+ /* create array from right hand side */
+ Py_INCREF(orhs);
+ arhs = (PyArrayObject *)PyArray_EnsureArray(orhs);
+ if (arhs == NULL) {
+ return 0;
+ }
+
+ /*
+ * if rhs is not a scalar dimensions must match
+ * TODO: one could allow broadcasting on equal types
+ */
+ if (!(PyArray_NDIM(arhs) == 0 ||
+ (PyArray_NDIM(arhs) == PyArray_NDIM(alhs) &&
+ PyArray_CompareLists(PyArray_DIMS(alhs), PyArray_DIMS(arhs),
+ PyArray_NDIM(arhs))))) {
+ Py_DECREF(arhs);
+ return 0;
+ }
+
+ /* must be safe to cast (checks values for scalar in rhs) */
+ if (PyArray_CanCastArrayTo(arhs, PyArray_DESCR(alhs),
+ NPY_SAFE_CASTING)) {
+ Py_DECREF(arhs);
+ return check_callers(cannot);
+ }
+ Py_DECREF(arhs);
+ }
+
+ return 0;
+}
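Sketch of what passes and fails the candidacy checks above (illustrative; sizes assume the release-build threshold):

```python
import numpy as np

a = np.ones(1 << 20)

r1 = (a + 1.0) + 2.0  # (a + 1.0) owns its data, refcount 1: candidate
v = a[::2]
r2 = v + a[::2]       # `v` is named and a[::2] is a non-owning view: no
s = np.ones(100)
r3 = (s + s) + s      # below NPY_MIN_ELIDE_BYTES: no
o = np.ones(1 << 20, dtype=object)
r4 = (o + o) + o      # object dtype (type_num >= NPY_OBJECT): no
```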
+
+/*
+ * try to elide a binary op; if commutative is true, also try swapped arguments
+ */
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative)
+{
+ /* set when no elision can be done independently of argument order */
+ int cannot = 0;
+ if (can_elide_temp(m1, m2, &cannot)) {
+ *res = inplace_op(m1, m2);
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in binary op");
+#endif
+ return 1;
+ }
+ else if (commutative && !cannot) {
+ if (can_elide_temp((PyArrayObject *)m2, (PyObject *)m1, &cannot)) {
+ *res = inplace_op((PyArrayObject *)m2, (PyObject *)m1);
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in commutative binary op");
+#endif
+ return 1;
+ }
+ }
+ *res = NULL;
+ return 0;
+}
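The commutative swap in action, as sketched in the header comment of this file:

```python
import numpy as np

a = np.ones(1 << 20)
b = np.ones(1 << 20)
# Eliding into `b` fails (it is named, refcount > 1), but multiplication
# is commutative, so the arguments are swapped and the (a * 2) temporary
# absorbs the operation, effectively computing (2 * a) * b.
r = b * (a * 2)
```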
+
+/* try to elide a unary temporary */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+ int cannot;
+ if (Py_REFCNT(m1) != 1 || !PyArray_CheckExact(m1) ||
+ PyArray_DESCR(m1)->type_num == NPY_VOID ||
+ !(PyArray_FLAGS(m1) & NPY_ARRAY_OWNDATA) ||
+ PyArray_NBYTES(m1) < NPY_MIN_ELIDE_BYTES) {
+ return 0;
+ }
+ if (check_callers(&cannot)) {
+#if NPY_ELIDE_DEBUG != 0
+ puts("elided temporary in unary op");
+#endif
+ return 1;
+ }
+ else {
+ return 0;
+ }
+}
+#else /* unsupported interpreter or missing backtrace */
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1)
+{
+ return 0;
+}
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative)
+{
+ *res = NULL;
+ return 0;
+}
+#endif
diff --git a/numpy/core/src/multiarray/temp_elide.h b/numpy/core/src/multiarray/temp_elide.h
new file mode 100644
index 000000000..d073adf28
--- /dev/null
+++ b/numpy/core/src/multiarray/temp_elide.h
@@ -0,0 +1,15 @@
+#ifndef _NPY_ARRAY_TEMP_AVOID_H_
+#define _NPY_ARRAY_TEMP_AVOID_H_
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+NPY_NO_EXPORT int
+can_elide_temp_unary(PyArrayObject * m1);
+
+NPY_NO_EXPORT int
+try_binary_elide(PyArrayObject * m1, PyObject * m2,
+ PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2),
+ PyObject ** res, int commutative);
+
+#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index fa5051ba7..b780c5558 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -2743,8 +2743,9 @@ class TestBinop(object):
# d = input.copy() # refcount 1
# return d, d + d # PyNumber_Add without increasing refcount
from numpy.core.multiarray_tests import incref_elide
- d = np.ones(5)
+ d = np.ones(100000)
orig, res = incref_elide(d)
+ d + d
# the returned original should not have been changed by an inplace operation
assert_array_equal(orig, d)
assert_array_equal(res, d + d)
@@ -2758,12 +2759,52 @@ class TestBinop(object):
# return l[4] + l[4] # PyNumber_Add without increasing refcount
from numpy.core.multiarray_tests import incref_elide_l
# padding with 1 makes sure the object on the stack is not overwritten
- l = [1, 1, 1, 1, np.ones(5)]
+ l = [1, 1, 1, 1, np.ones(100000)]
res = incref_elide_l(l)
# the returned original should not have been changed by an inplace operation
- assert_array_equal(l[4], np.ones(5))
+ assert_array_equal(l[4], np.ones(100000))
assert_array_equal(res, l[4] + l[4])
+ def test_temporary_with_cast(self):
+ # check that we don't elide into a temporary which would need casting
+ d = np.ones(200000, dtype=np.int64)
+ assert_equal(((d + d) + 2**222).dtype, np.dtype('O'))
+
+ r = ((d + d) / 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = np.true_divide((d + d), 2)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) / 2.)
+ assert_equal(r.dtype, np.dtype('f8'))
+
+ r = ((d + d) // 2)
+ assert_equal(r.dtype, np.dtype(np.int64))
+
+ # commutative elision into the astype result
+ f = np.ones(100000, dtype=np.float32)
+ assert_equal(((f + f) + f.astype(np.float64)).dtype, np.dtype('f8'))
+
+ # no elision into f + f
+ d = f.astype(np.float64)
+ assert_equal(((f + f) + d).dtype, np.dtype('f8'))
+
+ def test_elide_broadcast(self):
+ # test that there is no elision on broadcast to a higher dimension
+ # this only triggers the elision code path in debug mode, as triggering
+ # it in normal mode needs a matching dimension of 256KiB, i.e. a lot
+ # of memory
+ d = np.ones((2000, 1), dtype=int)
+ b = np.ones((2000), dtype=np.bool)
+ r = (1 - d) + b
+ assert_equal(r, 1)
+ assert_equal(r.shape, (2000, 2000))
+
+ def test_elide_scalar(self):
+ # check inplace op does not create ndarray from scalars
+ a = np.bool_()
+ assert_(type(~(a & a)) is np.bool_)
+
def test_ufunc_override_rop_precedence(self):
# 2016-01-29: NUMPY_UFUNC_DISABLED
return