summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/release/1.11.0-notes.rst3
-rw-r--r--doc/source/reference/routines.other.rst9
-rw-r--r--numpy/add_newdocs.py30
-rw-r--r--numpy/core/_internal.py4
-rw-r--r--numpy/core/bento.info3
-rw-r--r--numpy/core/bscript1
-rw-r--r--numpy/core/function_base.py47
-rw-r--r--numpy/core/numeric.py9
-rw-r--r--numpy/core/setup.py8
-rw-r--r--numpy/core/src/multiarray/array_assign.c27
-rw-r--r--numpy/core/src/multiarray/arrayobject.c1
-rw-r--r--numpy/core/src/multiarray/common.c33
-rw-r--r--numpy/core/src/multiarray/common.h5
-rw-r--r--numpy/core/src/multiarray/getset.c1
-rw-r--r--numpy/core/src/multiarray/multiarray_tests.c.src649
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c82
-rw-r--r--numpy/core/src/multiarray/multiarraymodule_onefile.c1
-rw-r--r--numpy/core/src/private/mem_overlap.c905
-rw-r--r--numpy/core/src/private/mem_overlap.h50
-rw-r--r--numpy/core/src/private/npy_extint128.h317
-rw-r--r--numpy/core/tests/test_extint128.py225
-rw-r--r--numpy/core/tests/test_mem_overlap.py486
-rw-r--r--numpy/core/tests/test_multiarray_assignment.py84
23 files changed, 2816 insertions, 164 deletions
diff --git a/doc/release/1.11.0-notes.rst b/doc/release/1.11.0-notes.rst
index cb1b19a92..3ca77eea5 100644
--- a/doc/release/1.11.0-notes.rst
+++ b/doc/release/1.11.0-notes.rst
@@ -29,6 +29,9 @@ as the argument to 'bins' results in the corresponding estimator being used.
has been added, converting the previous vbench-based one. You can run the suite locally
via ``python runtests.py --bench``. For more details, see ``benchmarks/README.rst``.
+* A new function ``np.shares_memory`` that can check exactly whether two
+arrays have memory overlap is added. ``np.may_share_memory`` also now
+has an option to spend more effort to reduce false positives.
Improvements
============
diff --git a/doc/source/reference/routines.other.rst b/doc/source/reference/routines.other.rst
index 354f45733..a3a1f8a06 100644
--- a/doc/source/reference/routines.other.rst
+++ b/doc/source/reference/routines.other.rst
@@ -22,3 +22,12 @@ Performance tuning
restoredot
setbufsize
getbufsize
+
+Memory ranges
+-------------
+
+.. autosummary::
+ :toctree: generated/
+
+ shares_memory
+ may_share_memory
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 195789a39..607e28a28 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -3784,24 +3784,40 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
"""))
-add_newdoc('numpy.core.multiarray', 'may_share_memory',
+add_newdoc('numpy.core.multiarray', 'shares_memory',
"""
- Determine if two arrays can share memory
+ shares_memory(a, b, max_work=None)
- The memory-bounds of a and b are computed. If they overlap then
- this function returns True. Otherwise, it returns False.
-
- A return of True does not necessarily mean that the two arrays
- share any element. It just means that they *might*.
+ Determine if two arrays share memory
Parameters
----------
a, b : ndarray
+ Input arrays
+ max_work : int, optional
+ Effort to spend on solving the overlap problem (maximum number
+ of candidate solutions to consider). The following special
+ values are recognized:
+
+ max_work=MAY_SHARE_EXACT (default)
+ The problem is solved exactly. In this case, the function returns
+ True only if there is an element shared between the arrays.
+ max_work=MAY_SHARE_BOUNDS
+ Only the memory bounds of a and b are checked.
+
+ Raises
+ ------
+ numpy.TooHardError
+ Exceeded max_work.
Returns
-------
out : bool
+ See Also
+ --------
+ may_share_memory
+
Examples
--------
>>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index bf492d105..879f4a224 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -759,3 +759,7 @@ def _gcd(a, b):
while b:
a, b = b, a % b
return a
+
+# Exception used in shares_memory()
+class TooHardError(RuntimeError):
+ pass
diff --git a/numpy/core/bento.info b/numpy/core/bento.info
index aaf261ddc..0c335e08a 100644
--- a/numpy/core/bento.info
+++ b/numpy/core/bento.info
@@ -22,7 +22,8 @@ Library:
src/multiarray/multiarraymodule_onefile.c
Extension: multiarray_tests
Sources:
- src/multiarray/multiarray_tests.c.src
+ src/multiarray/multiarray_tests.c.src,
+ src/private/mem_overlap.c
Extension: umath
Sources:
src/umath/umathmodule_onefile.c
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 4eea31ed7..944c2996f 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -484,6 +484,7 @@ def pre_build(context):
pjoin('src', 'multiarray', 'usertypes.c'),
pjoin('src', 'multiarray', 'vdot.c'),
pjoin('src', 'private', 'templ_common.h.src'),
+ pjoin('src', 'private', 'mem_overlap.c'),
]
if bld.env.HAS_CBLAS:
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 532ef2950..05fea557a 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -1,9 +1,9 @@
from __future__ import division, absolute_import, print_function
-__all__ = ['logspace', 'linspace']
+__all__ = ['logspace', 'linspace', 'may_share_memory']
from . import numeric as _nx
-from .numeric import result_type, NaN
+from .numeric import result_type, NaN, shares_memory, MAY_SHARE_BOUNDS, TooHardError
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
@@ -201,3 +201,46 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
if dtype is None:
return _nx.power(base, y)
return _nx.power(base, y).astype(dtype)
+
+
+def may_share_memory(a, b, max_work=None):
+ """Determine if two arrays can share memory
+
+ A return of True does not necessarily mean that the two arrays
+ share any element. It just means that they *might*.
+
+ Only the memory bounds of a and b are checked by default.
+
+ Parameters
+ ----------
+ a, b : ndarray
+ Input arrays
+ max_work : int, optional
+ Effort to spend on solving the overlap problem. See
+ `shares_memory` for details. Default for ``may_share_memory``
+ is to do a bounds check.
+
+ Returns
+ -------
+ out : bool
+
+ See Also
+ --------
+ shares_memory
+
+ Examples
+ --------
+ >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
+ False
+ >>> x = np.zeros([3, 4])
+ >>> np.may_share_memory(x[:,0], x[:,1])
+ True
+
+ """
+ if max_work is None:
+ max_work = MAY_SHARE_BOUNDS
+ try:
+ return shares_memory(a, b, max_work=max_work)
+ except (TooHardError, OverflowError):
+ # Unable to determine, assume yes
+ return True
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index f0163876f..1b7dfca3e 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -10,6 +10,7 @@ from .umath import (invert, sin, UFUNC_BUFSIZE_DEFAULT, ERR_IGNORE,
ERR_DEFAULT, PINF, NAN)
from . import numerictypes
from .numerictypes import longlong, intc, int_, float_, complex_, bool_
+from ._internal import TooHardError
if sys.version_info[0] >= 3:
import pickle
@@ -39,8 +40,8 @@ __all__ = [
'getbufsize', 'seterrcall', 'geterrcall', 'errstate', 'flatnonzero',
'Inf', 'inf', 'infty', 'Infinity', 'nan', 'NaN', 'False_', 'True_',
'bitwise_not', 'CLIP', 'RAISE', 'WRAP', 'MAXDIMS', 'BUFSIZE',
- 'ALLOW_THREADS', 'ComplexWarning', 'may_share_memory', 'full',
- 'full_like', 'matmul',
+ 'ALLOW_THREADS', 'ComplexWarning', 'full', 'full_like', 'matmul',
+ 'shares_memory', 'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'TooHardError',
]
if sys.version_info[0] < 3:
@@ -65,6 +66,8 @@ RAISE = multiarray.RAISE
MAXDIMS = multiarray.MAXDIMS
ALLOW_THREADS = multiarray.ALLOW_THREADS
BUFSIZE = multiarray.BUFSIZE
+MAY_SHARE_BOUNDS = multiarray.MAY_SHARE_BOUNDS
+MAY_SHARE_EXACT = multiarray.MAY_SHARE_EXACT
ndarray = multiarray.ndarray
flatiter = multiarray.flatiter
@@ -375,7 +378,7 @@ fromstring = multiarray.fromstring
fromiter = multiarray.fromiter
fromfile = multiarray.fromfile
frombuffer = multiarray.frombuffer
-may_share_memory = multiarray.may_share_memory
+shares_memory = multiarray.shares_memory
if sys.version_info[0] < 3:
newbuffer = multiarray.newbuffer
getbuffer = multiarray.getbuffer
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 99084b25e..6d9926d89 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -762,6 +762,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'private', 'npy_config.h'),
join('src', 'private', 'templ_common.h.src'),
join('src', 'private', 'lowlevel_strided_loops.h'),
+ join('src', 'private', 'mem_overlap.h'),
+ join('src', 'private', 'npy_extint128.h'),
join('include', 'numpy', 'arrayobject.h'),
join('include', 'numpy', '_neighborhood_iterator_imp.h'),
join('include', 'numpy', 'npy_endian.h'),
@@ -831,6 +833,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'ucsnarrow.c'),
join('src', 'multiarray', 'vdot.c'),
join('src', 'private', 'templ_common.h.src'),
+ join('src', 'private', 'mem_overlap.c'),
]
blas_info = get_info('blas_opt', 0)
@@ -959,7 +962,10 @@ def configuration(parent_package='',top_path=None):
#######################################################################
config.add_extension('multiarray_tests',
- sources=[join('src', 'multiarray', 'multiarray_tests.c.src')])
+ sources=[join('src', 'multiarray', 'multiarray_tests.c.src'),
+ join('src', 'private', 'mem_overlap.c')],
+ depends=[join('src', 'private', 'mem_overlap.h'),
+ join('src', 'private', 'npy_extint128.h')])
#######################################################################
# operand_flag_tests module #
diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c
index fa764d758..a48e245d8 100644
--- a/numpy/core/src/multiarray/array_assign.c
+++ b/numpy/core/src/multiarray/array_assign.c
@@ -23,6 +23,7 @@
#include "array_assign.h"
#include "common.h"
#include "lowlevel_strided_loops.h"
+#include "mem_overlap.h"
/* See array_assign.h for parameter documentation */
NPY_NO_EXPORT int
@@ -101,27 +102,17 @@ raw_array_is_aligned(int ndim, char *data, npy_intp *strides, int alignment)
}
-/* Gets a half-open range [start, end) which contains the array data */
-NPY_NO_EXPORT void
-get_array_memory_extents(PyArrayObject *arr,
- npy_uintp *out_start, npy_uintp *out_end)
-{
- npy_intp low, upper;
- offset_bounds_from_strides(PyArray_ITEMSIZE(arr), PyArray_NDIM(arr),
- PyArray_DIMS(arr), PyArray_STRIDES(arr),
- &low, &upper);
- *out_start = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)low;
- *out_end = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)upper;
-}
-
/* Returns 1 if the arrays have overlapping data, 0 otherwise */
NPY_NO_EXPORT int
arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2)
{
- npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0;
-
- get_array_memory_extents(arr1, &start1, &end1);
- get_array_memory_extents(arr2, &start2, &end2);
+ mem_overlap_t result;
- return (start1 < end2) && (start2 < end1);
+ result = solve_may_share_memory(arr1, arr2, NPY_MAY_SHARE_BOUNDS);
+ if (result == MEM_OVERLAP_NO) {
+ return 0;
+ }
+ else {
+ return 1;
+ }
}
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 86ea38cd7..fd5b15a0a 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -51,6 +51,7 @@ maintainer email: oliphant.travis@ieee.org
#include "buffer.h"
#include "array_assign.h"
#include "alloc.h"
+#include "mem_overlap.h"
/*NUMPY_API
Compute the size of an array (in number of items)
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index a5f3b3d55..3352c3529 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -765,39 +765,6 @@ _IsWriteable(PyArrayObject *ap)
}
-/* Gets a half-open range [start, end) of offsets from the data pointer */
-NPY_NO_EXPORT void
-offset_bounds_from_strides(const int itemsize, const int nd,
- const npy_intp *dims, const npy_intp *strides,
- npy_intp *lower_offset, npy_intp *upper_offset) {
- npy_intp max_axis_offset;
- npy_intp lower = 0;
- npy_intp upper = 0;
- int i;
-
- for (i = 0; i < nd; i++) {
- if (dims[i] == 0) {
- /* If the array size is zero, return an empty range */
- *lower_offset = 0;
- *upper_offset = 0;
- return;
- }
- /* Expand either upwards or downwards depending on stride */
- max_axis_offset = strides[i] * (dims[i] - 1);
- if (max_axis_offset > 0) {
- upper += max_axis_offset;
- }
- else {
- lower += max_axis_offset;
- }
- }
- /* Return a half-open range */
- upper += itemsize;
- *lower_offset = lower;
- *upper_offset = upper;
-}
-
-
/**
* Convert an array shape to a string such as "(1, 2)".
*
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 9cf2e27bf..11993829f 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -65,11 +65,6 @@ _IsAligned(PyArrayObject *ap);
NPY_NO_EXPORT npy_bool
_IsWriteable(PyArrayObject *ap);
-NPY_NO_EXPORT void
-offset_bounds_from_strides(const int itemsize, const int nd,
- const npy_intp *dims, const npy_intp *strides,
- npy_intp *lower_offset, npy_intp *upper_offset);
-
NPY_NO_EXPORT PyObject *
convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending);
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 0b694deed..5147b9735 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -17,6 +17,7 @@
#include "descriptor.h"
#include "getset.h"
#include "arrayobject.h"
+#include "mem_overlap.h"
/******************* array attribute get and set routines ******************/
diff --git a/numpy/core/src/multiarray/multiarray_tests.c.src b/numpy/core/src/multiarray/multiarray_tests.c.src
index e7f8e36fd..5e247e15e 100644
--- a/numpy/core/src/multiarray/multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/multiarray_tests.c.src
@@ -1,6 +1,9 @@
+/* -*-c-*- */
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include <Python.h>
#include "numpy/arrayobject.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
/* test PyArray_IsPythonScalar, before including private py3 compat header */
static PyObject *
@@ -947,6 +950,607 @@ test_nditer_too_large(PyObject *NPY_UNUSED(self), PyObject *args) {
}
+static PyObject *
+array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+ PyObject *A = NULL;
+ PyObject *U = NULL;
+ Py_ssize_t b_input = 0;
+ Py_ssize_t max_work = -1;
+ int simplify = 0;
+ int require_ub_nontrivial = 0;
+ static char *kwlist[] = {"A", "U", "b", "max_work", "simplify",
+ "require_ub_nontrivial", NULL};
+
+ diophantine_term_t terms[2*NPY_MAXDIMS+2];
+ npy_int64 x[2*NPY_MAXDIMS+2];
+ npy_int64 b;
+ unsigned int nterms, j;
+ mem_overlap_t result = MEM_OVERLAP_YES;
+ PyObject *retval = NULL;
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O!n|nii", kwlist,
+ &PyTuple_Type, &A,
+ &PyTuple_Type, &U,
+ &b_input, &max_work, &simplify,
+ &require_ub_nontrivial)) {
+ return NULL;
+ }
+
+ if (PyTuple_GET_SIZE(A) > sizeof(terms) / sizeof(diophantine_term_t)) {
+ PyErr_SetString(PyExc_ValueError, "too many terms in equation");
+ goto fail;
+ }
+
+ nterms = PyTuple_GET_SIZE(A);
+
+ if (PyTuple_GET_SIZE(U) != nterms) {
+ PyErr_SetString(PyExc_ValueError, "A, U must be tuples of equal length");
+ goto fail;
+ }
+
+ for (j = 0; j < nterms; ++j) {
+ terms[j].a = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(A, j));
+ if (terms[j].a == -1 && PyErr_Occurred()) {
+ goto fail;
+ }
+ terms[j].ub = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(U, j));
+ if (terms[j].ub == -1 && PyErr_Occurred()) {
+ goto fail;
+ }
+ }
+
+ b = b_input;
+
+ NPY_BEGIN_THREADS;
+ if (simplify && !require_ub_nontrivial) {
+ if (diophantine_simplify(&nterms, terms, b)) {
+ result = MEM_OVERLAP_OVERFLOW;
+ }
+ }
+ if (result == MEM_OVERLAP_YES) {
+ result = solve_diophantine(nterms, terms, b, max_work, require_ub_nontrivial, x);
+ }
+ NPY_END_THREADS;
+
+ if (result == MEM_OVERLAP_YES) {
+ retval = PyTuple_New(nterms);
+ if (retval == NULL) {
+ goto fail;
+ }
+
+ for (j = 0; j < nterms; ++j) {
+ PyObject *obj;
+#if defined(NPY_PY3K)
+ obj = PyLong_FromSsize_t(x[j]);
+#else
+ obj = PyInt_FromSsize_t(x[j]);
+#endif
+ if (obj == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(retval, j, obj);
+ }
+ }
+ else if (result == MEM_OVERLAP_NO) {
+ retval = Py_None;
+ Py_INCREF(retval);
+ }
+ else if (result == MEM_OVERLAP_ERROR) {
+ PyErr_SetString(PyExc_ValueError, "Invalid arguments");
+ }
+ else if (result == MEM_OVERLAP_OVERFLOW) {
+ PyErr_SetString(PyExc_OverflowError, "Integer overflow");
+ }
+ else if (result == MEM_OVERLAP_TOO_HARD) {
+ PyErr_SetString(PyExc_RuntimeError, "Too much work done");
+ }
+ else {
+ PyErr_SetString(PyExc_RuntimeError, "Unknown error");
+ }
+
+ return retval;
+
+fail:
+ Py_XDECREF(retval);
+ return NULL;
+}
+
+
+static PyObject *
+array_internal_overlap(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+ PyArrayObject * self = NULL;
+ static char *kwlist[] = {"self", "max_work", NULL};
+
+ mem_overlap_t result;
+ Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|n", kwlist,
+ PyArray_Converter, &self,
+ &max_work)) {
+ return NULL;
+ }
+
+ if (max_work < -2) {
+ PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+ goto fail;
+ }
+
+ NPY_BEGIN_THREADS;
+ result = solve_may_have_internal_overlap(self, max_work);
+ NPY_END_THREADS;
+
+ Py_XDECREF(self);
+
+ if (result == MEM_OVERLAP_NO) {
+ Py_RETURN_FALSE;
+ }
+ else if (result == MEM_OVERLAP_YES) {
+ Py_RETURN_TRUE;
+ }
+ else if (result == MEM_OVERLAP_OVERFLOW) {
+ PyErr_SetString(PyExc_OverflowError,
+ "Integer overflow in computing overlap");
+ return NULL;
+ }
+ else if (result == MEM_OVERLAP_TOO_HARD) {
+ PyErr_SetString(PyExc_ValueError,
+ "Exceeded max_work");
+ return NULL;
+ }
+ else {
+ /* Doesn't happen usually */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Error in computing overlap");
+ return NULL;
+ }
+
+fail:
+ Py_XDECREF(self);
+ return NULL;
+}
+
+
+static PyObject *
+pylong_from_int128(npy_extint128_t value)
+{
+ PyObject *val_64 = NULL, *val = NULL, *tmp = NULL, *tmp2 = NULL;
+
+ val_64 = PyLong_FromLong(64);
+ if (val_64 == NULL) {
+ goto fail;
+ }
+
+ val = PyLong_FromUnsignedLongLong(value.hi);
+ if (val == NULL) {
+ goto fail;
+ }
+
+ tmp = PyNumber_Lshift(val, val_64);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ Py_DECREF(val);
+ val = tmp;
+
+ tmp = PyLong_FromUnsignedLongLong(value.lo);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ tmp2 = PyNumber_Or(val, tmp);
+ if (tmp2 == NULL) {
+ goto fail;
+ }
+
+ Py_DECREF(val);
+ Py_DECREF(tmp);
+
+ val = NULL;
+ tmp = NULL;
+
+ if (value.sign < 0) {
+ val = PyNumber_Negative(tmp2);
+ if (val == NULL) {
+ goto fail;
+ }
+ Py_DECREF(tmp2);
+ return val;
+ }
+ else {
+ val = tmp2;
+ }
+ return val;
+
+fail:
+ Py_XDECREF(val_64);
+ Py_XDECREF(tmp);
+ Py_XDECREF(tmp2);
+ Py_XDECREF(val);
+ return NULL;
+}
+
+
+static int
+int128_from_pylong(PyObject *obj, npy_extint128_t *result)
+{
+ PyObject *long_obj = NULL, *val_64 = NULL, *val_0 = NULL,
+ *mask_64 = NULL, *max_128 = NULL, *hi_bits = NULL,
+ *lo_bits = NULL, *tmp = NULL;
+ int cmp;
+ int negative_zero = 0;
+
+ if (PyBool_Check(obj)) {
+ /* False means negative zero */
+ negative_zero = 1;
+ }
+
+ long_obj = PyObject_CallFunction((PyObject*)&PyLong_Type, "O", obj);
+ if (long_obj == NULL) {
+ goto fail;
+ }
+
+ val_0 = PyLong_FromLong(0);
+ if (val_0 == NULL) {
+ goto fail;
+ }
+
+ val_64 = PyLong_FromLong(64);
+ if (val_64 == NULL) {
+ goto fail;
+ }
+
+ mask_64 = PyLong_FromUnsignedLongLong(0xffffffffffffffffULL);
+ if (mask_64 == NULL) {
+ goto fail;
+ }
+
+ tmp = PyNumber_Lshift(mask_64, val_64);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ max_128 = PyNumber_Or(tmp, mask_64);
+ if (max_128 == NULL) {
+ goto fail;
+ }
+ Py_DECREF(tmp);
+ tmp = NULL;
+
+ cmp = PyObject_RichCompareBool(long_obj, val_0, Py_LT);
+ if (cmp == -1) {
+ goto fail;
+ }
+ else if (cmp == 1) {
+ tmp = PyNumber_Negative(long_obj);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ Py_DECREF(long_obj);
+ long_obj = tmp;
+ tmp = NULL;
+ result->sign = -1;
+ }
+ else {
+ result->sign = 1;
+ }
+
+ cmp = PyObject_RichCompareBool(long_obj, max_128, Py_GT);
+ if (cmp == 1) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ goto fail;
+ }
+ else if (cmp == -1) {
+ goto fail;
+ }
+
+ hi_bits = PyNumber_Rshift(long_obj, val_64);
+ if (hi_bits == NULL) {
+ goto fail;
+ }
+
+ lo_bits = PyNumber_And(long_obj, mask_64);
+ if (lo_bits == NULL) {
+ goto fail;
+ }
+
+ result->hi = PyLong_AsUnsignedLongLong(hi_bits);
+ if (result->hi == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+ goto fail;
+ }
+
+ result->lo = PyLong_AsUnsignedLongLong(lo_bits);
+ if (result->lo == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+ goto fail;
+ }
+
+ if (negative_zero && result->hi == 0 && result->lo == 0) {
+ result->sign = -1;
+ }
+
+ Py_XDECREF(long_obj);
+ Py_XDECREF(val_64);
+ Py_XDECREF(val_0);
+ Py_XDECREF(mask_64);
+ Py_XDECREF(max_128);
+ Py_XDECREF(hi_bits);
+ Py_XDECREF(lo_bits);
+ Py_XDECREF(tmp);
+ return 0;
+
+fail:
+ Py_XDECREF(long_obj);
+ Py_XDECREF(val_64);
+ Py_XDECREF(val_0);
+ Py_XDECREF(mask_64);
+ Py_XDECREF(max_128);
+ Py_XDECREF(hi_bits);
+ Py_XDECREF(lo_bits);
+ Py_XDECREF(tmp);
+ return -1;
+}
+
+
+static PyObject *
+extint_safe_binop(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a, b, c;
+ int op;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "LLi", &a, &b, &op)) {
+ return NULL;
+ }
+ if (op == 1) {
+ c = safe_add(a, b, &overflow);
+ }
+ else if (op == 2) {
+ c = safe_sub(a, b, &overflow);
+ }
+ else if (op == 3) {
+ c = safe_mul(a, b, &overflow);
+ }
+ else {
+ PyErr_SetString(PyExc_ValueError, "invalid op");
+ return NULL;
+ }
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return PyLong_FromLongLong(c);
+}
+
+
+static PyObject *
+extint_to_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a;
+ if (!PyArg_ParseTuple(args, "L", &a)) {
+ return NULL;
+ }
+ return pylong_from_int128(to_128(a));
+}
+
+
+static PyObject *
+extint_to_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a;
+ PY_LONG_LONG r;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ r = to_64(a, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return PyLong_FromLongLong(r);
+}
+
+
+static PyObject *
+extint_mul_64_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PY_LONG_LONG a, b;
+ npy_extint128_t c;
+ if (!PyArg_ParseTuple(args, "LL", &a, &b)) {
+ return NULL;
+ }
+ c = mul_64_64(a, b);
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_add_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b, c;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ c = add_128(a, b, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_sub_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b, c;
+ char overflow = 0;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ c = sub_128(a, b, &overflow);
+ if (overflow) {
+ PyErr_SetString(PyExc_OverflowError, "");
+ return NULL;
+ }
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_neg_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = neg_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shl_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = shl_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shr_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ b = shr_128(a);
+ return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_gt_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *b_obj;
+ npy_extint128_t a, b;
+ if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+ return NULL;
+ }
+ if (gt_128(a, b)) {
+ Py_RETURN_TRUE;
+ }
+ else {
+ Py_RETURN_FALSE;
+ }
+}
+
+
+static PyObject *
+extint_divmod_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj, *ret = NULL, *tmp = NULL;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ npy_int64 mod;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ goto fail;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ goto fail;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ goto fail;
+ }
+
+ c = divmod_128_64(a, b, &mod);
+
+ ret = PyTuple_New(2);
+
+ tmp = pylong_from_int128(c);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(ret, 0, tmp);
+
+ tmp = PyLong_FromLongLong(mod);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ PyTuple_SET_ITEM(ret, 1, tmp);
+ return ret;
+
+fail:
+ Py_XDECREF(ret);
+ Py_XDECREF(tmp);
+ return NULL;
+}
+
+
+static PyObject *
+extint_floordiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ return NULL;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ c = floordiv_128_64(a, b);
+ return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_ceildiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+ PyObject *a_obj;
+ npy_extint128_t a, c;
+ PY_LONG_LONG b;
+ if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+ return NULL;
+ }
+ if (b <= 0) {
+ PyErr_SetString(PyExc_ValueError, "");
+ return NULL;
+ }
+ if (int128_from_pylong(a_obj, &a)) {
+ return NULL;
+ }
+ c = ceildiv_128_64(a, b);
+ return pylong_from_int128(c);
+}
+
+
static PyMethodDef Multiarray_TestsMethods[] = {
{"IsPythonScalar",
IsPythonScalar,
@@ -989,6 +1593,51 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"test_nditer_too_large",
test_nditer_too_large,
METH_VARARGS, NULL},
+ {"solve_diophantine",
+ (PyCFunction)array_solve_diophantine,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"internal_overlap",
+ (PyCFunction)array_internal_overlap,
+ METH_VARARGS | METH_KEYWORDS, NULL},
+ {"extint_safe_binop",
+ extint_safe_binop,
+ METH_VARARGS, NULL},
+ {"extint_to_128",
+ extint_to_128,
+ METH_VARARGS, NULL},
+ {"extint_to_64",
+ extint_to_64,
+ METH_VARARGS, NULL},
+ {"extint_mul_64_64",
+ extint_mul_64_64,
+ METH_VARARGS, NULL},
+ {"extint_add_128",
+ extint_add_128,
+ METH_VARARGS, NULL},
+ {"extint_sub_128",
+ extint_sub_128,
+ METH_VARARGS, NULL},
+ {"extint_neg_128",
+ extint_neg_128,
+ METH_VARARGS, NULL},
+ {"extint_shl_128",
+ extint_shl_128,
+ METH_VARARGS, NULL},
+ {"extint_shr_128",
+ extint_shr_128,
+ METH_VARARGS, NULL},
+ {"extint_gt_128",
+ extint_gt_128,
+ METH_VARARGS, NULL},
+ {"extint_divmod_128_64",
+ extint_divmod_128_64,
+ METH_VARARGS, NULL},
+ {"extint_floordiv_128_64",
+ extint_floordiv_128_64,
+ METH_VARARGS, NULL},
+ {"extint_ceildiv_128_64",
+ extint_ceildiv_128_64,
+ METH_VARARGS, NULL},
{NULL, NULL, 0, NULL} /* Sentinel */
};
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 04513c56c..e72c355dc 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -60,6 +60,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "vdot.h"
#include "templ_common.h" /* for npy_mul_with_overflow_intp */
#include "compiled_base.h"
+#include "mem_overlap.h"
/* Only here for API compatibility */
NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -3993,30 +3994,88 @@ test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args)
return PyInt_FromLong(a);
}
+
static PyObject *
-array_may_share_memory(PyObject *NPY_UNUSED(ignored), PyObject *args)
+array_shares_memory(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
PyArrayObject * self = NULL;
PyArrayObject * other = NULL;
- int overlap;
+ PyObject *max_work_obj = NULL;
+ static char *kwlist[] = {"self", "other", "max_work", NULL};
+
+ mem_overlap_t result;
+ static PyObject *too_hard_cls = NULL;
+ Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+ NPY_BEGIN_THREADS_DEF;
- if (!PyArg_ParseTuple(args, "O&O&", PyArray_Converter, &self,
- PyArray_Converter, &other)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&O&|O", kwlist,
+ PyArray_Converter, &self,
+ PyArray_Converter, &other,
+ &max_work_obj)) {
return NULL;
}
- overlap = arrays_overlap(self, other);
+ if (max_work_obj == NULL || max_work_obj == Py_None) {
+ /* noop */
+ }
+ else if (PyLong_Check(max_work_obj)) {
+ max_work = PyLong_AsSsize_t(max_work_obj);
+ }
+#if !defined(NPY_PY3K)
+ else if (PyInt_Check(max_work_obj)) {
+ max_work = PyInt_AsSsize_t(max_work_obj);
+ }
+#endif
+ else {
+ PyErr_SetString(PyExc_ValueError, "max_work must be an integer");
+ goto fail;
+ }
+
+ if (max_work < -2) {
+ PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+ goto fail;
+ }
+
+ NPY_BEGIN_THREADS;
+ result = solve_may_share_memory(self, other, max_work);
+ NPY_END_THREADS;
+
Py_XDECREF(self);
Py_XDECREF(other);
- if (overlap) {
+ if (result == MEM_OVERLAP_NO) {
+ Py_RETURN_FALSE;
+ }
+ else if (result == MEM_OVERLAP_YES) {
Py_RETURN_TRUE;
}
+ else if (result == MEM_OVERLAP_OVERFLOW) {
+ PyErr_SetString(PyExc_OverflowError,
+ "Integer overflow in computing overlap");
+ return NULL;
+ }
+ else if (result == MEM_OVERLAP_TOO_HARD) {
+ npy_cache_import("numpy.core._internal", "TooHardError",
+ &too_hard_cls);
+ if (too_hard_cls) {
+ PyErr_SetString(too_hard_cls, "Exceeded max_work");
+ }
+ return NULL;
+ }
else {
- Py_RETURN_FALSE;
+ /* Doesn't happen usually */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Error in computing overlap");
+ return NULL;
}
+
+fail:
+ Py_XDECREF(self);
+ Py_XDECREF(other);
+ return NULL;
}
+
static struct PyMethodDef array_module_methods[] = {
{"_get_ndarray_c_version",
(PyCFunction)array__get_ndarray_c_version,
@@ -4123,9 +4182,9 @@ static struct PyMethodDef array_module_methods[] = {
{"result_type",
(PyCFunction)array_result_type,
METH_VARARGS, NULL},
- {"may_share_memory",
- (PyCFunction)array_may_share_memory,
- METH_VARARGS, NULL},
+ {"shares_memory",
+ (PyCFunction)array_shares_memory,
+ METH_VARARGS | METH_KEYWORDS, NULL},
/* Datetime-related functions */
{"datetime_data",
(PyCFunction)array_datetime_data,
@@ -4583,6 +4642,9 @@ PyMODINIT_FUNC initmultiarray(void) {
ADDCONST(RAISE);
ADDCONST(WRAP);
ADDCONST(MAXDIMS);
+
+ ADDCONST(MAY_SHARE_BOUNDS);
+ ADDCONST(MAY_SHARE_EXACT);
#undef ADDCONST
Py_INCREF(&PyArray_Type);
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 3940d009b..3924f3cf4 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -53,6 +53,7 @@
#include "ucsnarrow.c"
#include "arrayobject.c"
#include "numpymemoryview.c"
+#include "mem_overlap.c"
#include "multiarraymodule.c"
#include "compiled_base.c"
diff --git a/numpy/core/src/private/mem_overlap.c b/numpy/core/src/private/mem_overlap.c
new file mode 100644
index 000000000..3cab83497
--- /dev/null
+++ b/numpy/core/src/private/mem_overlap.c
@@ -0,0 +1,905 @@
+/*
+ Solving memory overlap integer programs and bounded Diophantine equations with
+ positive coefficients.
+
+ Asking whether two strided arrays `a` and `b` overlap is equivalent to
+ asking whether there is a solution to the following problem::
+
+ sum(stride_a[i] * x_a[i] for i in range(ndim_a))
+ -
+ sum(stride_b[i] * x_b[i] for i in range(ndim_b))
+ ==
+ base_b - base_a
+
+ 0 <= x_a[i] < shape_a[i]
+ 0 <= x_b[i] < shape_b[i]
+
+ for some integer x_a, x_b. Itemsize needs to be considered as an additional
+ dimension with stride 1 and size itemsize.
+
+ Negative strides can be changed to positive (and vice versa) by changing
+ variables x[i] -> shape[i] - 1 - x[i], and zero strides can be dropped, so
+ that the problem can be recast into a bounded Diophantine equation with
+ positive coefficients::
+
+ sum(a[i] * x[i] for i in range(n)) == b
+
+ a[i] > 0
+
+ 0 <= x[i] <= ub[i]
+
+ This problem is NP-hard --- runtime of algorithms grows exponentially with
+ increasing ndim.
+
+
+ *Algorithm description*
+
+ A straightforward algorithm that excludes infeasible solutions using GCD-based
+ pruning is outlined in Ref. [1]. It is implemented below. A number of other
+ algorithms exist in the literature; however, this one seems to have
+ performance satisfactory for the present purpose.
+
+ The idea is that an equation::
+
+ a_1 x_1 + a_2 x_2 + ... + a_n x_n = b
+ 0 <= x_i <= ub_i, i = 1...n
+
+ implies::
+
+ a_2' x_2' + a_3 x_3 + ... + a_n x_n = b
+
+ 0 <= x_i <= ub_i, i = 2...n
+
+ 0 <= x_1' <= c_1 ub_1 + c_2 ub_2
+
+ with a_2' = gcd(a_1, a_2) and x_2' = c_1 x_1 + c_2 x_2 with c_1 = (a_1/a_1'),
+ and c_2 = (a_2/a_1'). This procedure can be repeated to obtain::
+
+ a_{n-1}' x_{n-1}' + a_n x_n = b
+
+ 0 <= x_{n-1}' <= ub_{n-1}'
+
+ 0 <= x_n <= ub_n
+
+ Now, one can enumerate all candidate solutions for x_n. For each, one can use
+ the previous-level equation to enumerate potential solutions for x_{n-1}, with
+ transformed right-hand side b -> b - a_n x_n. And so forth, until after n-1
+ nested for loops we either arrive at a candidate solution for x_1 (in which
+ case we have found one solution to the problem), or find that the equations do
+ not allow any solutions either for x_1 or one of the intermediate x_i (in
+ which case we have proved there is no solution for the upper-level candidates
+ chosen). If no solution is found for any candidate x_n, we have proved the
+ problem is infeasible --- which for the memory overlap problem means there is
+ no overlap.
+
+
+ *Performance*
+
+ Some common ndarray cases are easy for the algorithm:
+
+ - Two arrays whose memory ranges do not overlap.
+
+ These will be excluded by the bounds on x_n, with max_work=1. We also add
+ this check as a fast path, to avoid computing GCDs needlessly, as this can
+ take some time.
+
+ - Arrays produced by continuous slicing of a continuous parent array (no
+ internal overlap), e.g., a=x[:,0,:], b=x[:,1,:]. The strides taken together,
+ mapped positive, and duplicates then satisfy gcd(stride[0], .., stride[j]) =
+ stride[j] for some ordering.
+
+ In this case, for each x[i] at most one candidate exists, given that the
+ algorithm runs with strides sorted from largest to smallest. The problem can
+ be written as::
+
+ sum a_j x_j ?= b = sum a_j z_j
+
+ a_j = n_{j+1} * n_{j+2} * ... * n_d, a_d = 1
+ 0 <= x_j <= u_j <= 2*n_j - 2
+ 0 <= z_j <= n_j - 1
+
+ b is the offset of the last element of the second array from the start of
+ the first. z_j are uniquely determined because of the gcd property. For
+ each x_j, the bounds at first sight allow x_j=z_j and x_j=z_j+n_j. However,
+ u_j <= n_j - 1 + z_j, so that at most one candidate is left.
+
+ - Two arrays with stride-incommensurate starting points. For example,
+ a=x[:,::2], b=x[:,1::2].
+
+ The base address difference is incommensurate with all strides, so that
+ there are no solution candidates to consider. For itemsize != 1, similar
+ result is obtained for x_{n-1}.
+
+ The above cases cover arrays produced by typical slicing of well-behaved
+ parent arrays. More generally, more difficult cases can result::
+
+ x = np.arange(4*20).reshape(4, 20).astype(np.int8)
+ a = x[:,::7]
+ b = x[:,3::3]
+
+ <=>
+
+ 20*x1 + 7*x2 + 3*x3 = 78 (= 3 + 3*20 + 5*3)
+ 0 <= x1 <= 6, 0 <= x2 <= 2, 0 <= x3 <= 5
+
+ Non-overlapping in this case relies on x.shape[1] <= lcm(7, 3) = 21. However,
+ elimination of x1 does not restrict candidate values for x3, so the algorithm
+ ends up considering all values x3=0...5 separately.
+
+ The upper bound for work done is prod(shape_a)*prod(shape_b), which scales
+ faster than than work done by binary ufuncs, after broadcasting,
+ prod(shape_a). The bound may be loose, but it is possible to construct hard
+ instances where ufunc is faster (adapted from [2,3])::
+
+ from numpy.lib.stride_tricks import as_strided
+ # Construct non-overlapping x1 and x2
+ x = np.zeros([192163377], dtype=np.int8)
+ x1 = as_strided(x, strides=(36674, 61119, 85569), shape=(1049, 1049, 1049))
+ x2 = as_strided(x[64023025:], strides=(12223, 12224, 1), shape=(1049, 1049, 1))
+
+ To avoid such worst cases, the amount of work done needs to be capped. If the
+ overlap problem is related to ufuncs, one suitable cap choice is to scale
+ max_work with the number of elements of the array. (Ref. [3] describes a more
+ efficient algorithm for solving problems similar to the above --- however,
+ also it must scale exponentially.)
+
+
+ *Integer overflows*
+
+ The algorithm is written in fixed-width integers, and can terminate with
+ failure if integer overflow is detected (the implementation catches all
+ cases). Potential failure modes:
+
+ - Array extent sum(stride*(shape-1)) is too large (for int64).
+
+ - Minimal solutions to a_i x_i + a_j x_j == b are too large,
+ in some of the intermediate equations.
+
+ We do this part of the computation in 128-bit integers.
+
+ In general, overflows are expected only if array size is close to
+ NPY_INT64_MAX, requiring ~exabyte size arrays, which is usually not possible.
+
+ References
+ ----------
+ .. [1] P. Ramachandran, ''Use of Extended Euclidean Algorithm in Solving
+ a System of Linear Diophantine Equations with Bounded Variables''.
+ Algorithmic Number Theory, Lecture Notes in Computer Science **4076**,
+ 182-192 (2006). doi:10.1007/11792086_14
+
+ .. [2] Cornuejols, Urbaniak, Weismantel, and Wolsey,
+ ''Decomposition of integer programs and of generating sets.'',
+ Lecture Notes in Computer Science 1284, 92-103 (1997).
+
+ .. [3] K. Aardal, A.K. Lenstra,
+ ''Hard equality constrained integer knapsacks'',
+ Lecture Notes in Computer Science 2337, 350-366 (2002).
+*/
+
+/*
+ Copyright (c) 2015 Pauli Virtanen
+ All rights reserved.
+ Licensed under 3-clause BSD license, see LICENSE.txt.
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "numpy/ndarraytypes.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
+
+
+#define MAX(a, b) (((a) >= (b)) ? (a) : (b))
+#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
+
+
+/**
+ * Euclid's algorithm for GCD.
+ *
+ * Solves for gamma*a1 + epsilon*a2 == gcd(a1, a2)
+ * providing |gamma| < |a2|/gcd, |epsilon| < |a1|/gcd.
+ */
+static void
+euclid(npy_int64 a1, npy_int64 a2, npy_int64 *a_gcd, npy_int64 *gamma, npy_int64 *epsilon)
+{
+ npy_int64 gamma1, gamma2, epsilon1, epsilon2, r;
+
+ assert(a1 > 0);
+ assert(a2 > 0);
+
+ gamma1 = 1;
+ gamma2 = 0;
+ epsilon1 = 0;
+ epsilon2 = 1;
+
+ /* The numbers remain bounded by |a1|, |a2| during
+ the iteration, so no integer overflows */
+ while (1) {
+ if (a2 > 0) {
+ r = a1/a2;
+ a1 -= r*a2;
+ gamma1 -= r*gamma2;
+ epsilon1 -= r*epsilon2;
+ }
+ else {
+ *a_gcd = a1;
+ *gamma = gamma1;
+ *epsilon = epsilon1;
+ break;
+ }
+
+ if (a1 > 0) {
+ r = a2/a1;
+ a2 -= r*a1;
+ gamma2 -= r*gamma1;
+ epsilon2 -= r*epsilon1;
+ }
+ else {
+ *a_gcd = a2;
+ *gamma = gamma2;
+ *epsilon = epsilon2;
+ break;
+ }
+ }
+}
+
+
+/**
+ * Precompute GCD and bounds transformations
+ */
+static int
+diophantine_precompute(unsigned int n,
+ diophantine_term_t *E,
+ diophantine_term_t *Ep,
+ npy_int64 *Gamma, npy_int64 *Epsilon)
+{
+ npy_int64 a_gcd, gamma, epsilon, c1, c2;
+ unsigned int j;
+ char overflow = 0;
+
+ assert(n >= 2);
+
+ euclid(E[0].a, E[1].a, &a_gcd, &gamma, &epsilon);
+ Ep[0].a = a_gcd;
+ Gamma[0] = gamma;
+ Epsilon[0] = epsilon;
+
+ if (n > 2) {
+ c1 = E[0].a / a_gcd;
+ c2 = E[1].a / a_gcd;
+
+ /* Ep[0].ub = E[0].ub * c1 + E[1].ub * c2; */
+ Ep[0].ub = safe_add(safe_mul(E[0].ub, c1, &overflow),
+ safe_mul(E[1].ub, c2, &overflow), &overflow);
+ if (overflow) {
+ return 1;
+ }
+ }
+
+ for (j = 2; j < n; ++j) {
+ euclid(Ep[j-2].a, E[j].a, &a_gcd, &gamma, &epsilon);
+ Ep[j-1].a = a_gcd;
+ Gamma[j-1] = gamma;
+ Epsilon[j-1] = epsilon;
+
+ if (j < n - 1) {
+ c1 = Ep[j-2].a / a_gcd;
+ c2 = E[j].a / a_gcd;
+
+ /* Ep[j-1].ub = c1 * Ep[j-2].ub + c2 * E[j].ub; */
+ Ep[j-1].ub = safe_add(safe_mul(c1, Ep[j-2].ub, &overflow),
+ safe_mul(c2, E[j].ub, &overflow), &overflow);
+
+ if (overflow) {
+ return 1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * Depth-first bounded Euclid search
+ */
+static mem_overlap_t
+diophantine_dfs(unsigned int n,
+ unsigned int v,
+ diophantine_term_t *E,
+ diophantine_term_t *Ep,
+ npy_int64 *Gamma, npy_int64 *Epsilon,
+ npy_int64 b,
+ Py_ssize_t max_work,
+ int require_ub_nontrivial,
+ npy_int64 *x,
+ Py_ssize_t *count)
+{
+ npy_int64 a_gcd, gamma, epsilon, a1, u1, a2, u2, c, r, c1, c2, t, t_l, t_u, b2, x1, x2;
+ npy_extint128_t x10, x20, t_l1, t_l2, t_u1, t_u2;
+ mem_overlap_t res;
+ char overflow = 0;
+
+ if (max_work >= 0 && *count >= max_work) {
+ return MEM_OVERLAP_TOO_HARD;
+ }
+
+ /* Fetch precomputed values for the reduced problem */
+ if (v == 1) {
+ a1 = E[0].a;
+ u1 = E[0].ub;
+ }
+ else {
+ a1 = Ep[v-2].a;
+ u1 = Ep[v-2].ub;
+ }
+
+ a2 = E[v].a;
+ u2 = E[v].ub;
+
+ a_gcd = Ep[v-1].a;
+ gamma = Gamma[v-1];
+ epsilon = Epsilon[v-1];
+
+ /* Generate set of allowed solutions */
+ c = b / a_gcd;
+ r = b % a_gcd;
+ if (r != 0) {
+ ++*count;
+ return MEM_OVERLAP_NO;
+ }
+
+ c1 = a2 / a_gcd;
+ c2 = a1 / a_gcd;
+
+ /*
+ The set to enumerate is:
+ x1 = gamma*c + c1*t
+ x2 = epsilon*c - c2*t
+ t integer
+ 0 <= x1 <= u1
+ 0 <= x2 <= u2
+ and we have c, c1, c2 >= 0
+ */
+
+ x10 = mul_64_64(gamma, c);
+ x20 = mul_64_64(epsilon, c);
+
+ t_l1 = ceildiv_128_64(neg_128(x10), c1);
+ t_l2 = ceildiv_128_64(sub_128(x20, to_128(u2), &overflow), c2);
+
+ t_u1 = floordiv_128_64(sub_128(to_128(u1), x10, &overflow), c1);
+ t_u2 = floordiv_128_64(x20, c2);
+
+ if (overflow) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+
+ if (gt_128(t_l2, t_l1)) {
+ t_l1 = t_l2;
+ }
+
+ if (gt_128(t_u1, t_u2)) {
+ t_u1 = t_u2;
+ }
+
+ if (gt_128(t_l1, t_u1)) {
+ ++*count;
+ return MEM_OVERLAP_NO;
+ }
+
+ t_l = to_64(t_l1, &overflow);
+ t_u = to_64(t_u1, &overflow);
+
+ x10 = add_128(x10, mul_64_64(c1, t_l), &overflow);
+ x20 = sub_128(x20, mul_64_64(c2, t_l), &overflow);
+
+ t_u = safe_sub(t_u, t_l, &overflow);
+ t_l = 0;
+ x1 = to_64(x10, &overflow);
+ x2 = to_64(x20, &overflow);
+
+ if (overflow) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+
+ /* The bounds t_l, t_u ensure the x computed below do not overflow */
+
+ if (v == 1) {
+ /* Base case */
+ if (t_u >= t_l) {
+ x[0] = x1 + c1*t_l;
+ x[1] = x2 - c2*t_l;
+ if (require_ub_nontrivial) {
+ int j, is_ub_trivial;
+
+ is_ub_trivial = 1;
+ for (j = 0; j < n; ++j) {
+ if (x[j] != E[j].ub/2) {
+ is_ub_trivial = 0;
+ break;
+ }
+ }
+
+ if (is_ub_trivial) {
+ /* Ignore 'trivial' solution */
+ ++*count;
+ return MEM_OVERLAP_NO;
+ }
+ }
+ return MEM_OVERLAP_YES;
+ }
+ ++*count;
+ return MEM_OVERLAP_NO;
+ }
+ else {
+ /* Recurse to all candidates */
+ for (t = t_l; t <= t_u; ++t) {
+ x[v] = x2 - c2*t;
+
+ /* b2 = b - a2*x[v]; */
+ b2 = safe_sub(b, safe_mul(a2, x[v], &overflow), &overflow);
+ if (overflow) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+
+ res = diophantine_dfs(n, v-1, E, Ep, Gamma, Epsilon,
+ b2, max_work, require_ub_nontrivial,
+ x, count);
+ if (res != MEM_OVERLAP_NO) {
+ return res;
+ }
+ }
+ ++*count;
+ return MEM_OVERLAP_NO;
+ }
+}
+
+
+/**
+ * Solve bounded Diophantine equation
+ *
+ * The problem considered is::
+ *
+ * A[0] x[0] + A[1] x[1] + ... + A[n-1] x[n-1] == b
+ * 0 <= x[i] <= U[i]
+ * A[i] > 0
+ *
+ * Solve via depth-first Euclid's algorithm, as explained in [1].
+ *
+ * If require_ub_nontrivial!=0, look for solutions to the problem
+ * where b = A[0]*(U[0]/2) + ... + A[n]*(U[n-1]/2) but ignoring
+ * the trivial solution x[i] = U[i]/2. All U[i] must be divisible by 2.
+ * The value given for `b` is ignored in this case.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_diophantine(unsigned int n, diophantine_term_t *E, npy_int64 b,
+ Py_ssize_t max_work, int require_ub_nontrivial, npy_int64 *x)
+{
+ unsigned int j;
+
+ for (j = 0; j < n; ++j) {
+ if (E[j].a <= 0) {
+ return MEM_OVERLAP_ERROR;
+ }
+ else if (E[j].ub < 0) {
+ return MEM_OVERLAP_NO;
+ }
+ }
+
+ if (require_ub_nontrivial) {
+ npy_int64 ub_sum = 0;
+ char overflow = 0;
+ for (j = 0; j < n; ++j) {
+ if (E[j].ub % 2 != 0) {
+ return MEM_OVERLAP_ERROR;
+ }
+ ub_sum = safe_add(ub_sum,
+ safe_mul(E[j].a, E[j].ub/2, &overflow),
+ &overflow);
+ }
+ if (overflow) {
+ return MEM_OVERLAP_ERROR;
+ }
+ b = ub_sum;
+ }
+
+ if (b < 0) {
+ return MEM_OVERLAP_NO;
+ }
+
+ if (n == 0) {
+ if (require_ub_nontrivial) {
+ /* Only trivial solution for 0-variable problem */
+ return MEM_OVERLAP_NO;
+ }
+ if (b == 0) {
+ return MEM_OVERLAP_YES;
+ }
+ return MEM_OVERLAP_NO;
+ }
+ else if (n == 1) {
+ if (require_ub_nontrivial) {
+ /* Only trivial solution for 1-variable problem */
+ return MEM_OVERLAP_NO;
+ }
+ if (b % E[0].a == 0) {
+ x[0] = b / E[0].a;
+ if (x[0] >= 0 && x[0] <= E[0].ub) {
+ return MEM_OVERLAP_YES;
+ }
+ }
+ return MEM_OVERLAP_NO;
+ }
+ else {
+ diophantine_term_t Ep[n];
+ npy_int64 Epsilon[n], Gamma[n];
+ Py_ssize_t count = 0;
+
+ if (diophantine_precompute(n, E, Ep, Gamma, Epsilon)) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+ return diophantine_dfs(n, n-1, E, Ep, Gamma, Epsilon, b, max_work,
+ require_ub_nontrivial, x, &count);
+ }
+}
+
+
+static int
+diophantine_sort_A(const void *xp, const void *yp)
+{
+ npy_int64 xa = ((diophantine_term_t*)xp)->a;
+ npy_int64 ya = ((diophantine_term_t*)yp)->a;
+
+ if (xa < ya) {
+ return 1;
+ }
+ else if (ya < xa) {
+ return -1;
+ }
+ else {
+ return 0;
+ }
+}
+
+
+/**
+ * Simplify Diophantine decision problem.
+ *
+ * Combine identical coefficients, remove unnecessary variables, and trim
+ * bounds.
+ *
+ * The feasible/infeasible decision result is retained.
+ *
+ * Returns: 0 (success), -1 (integer overflow).
+ */
+NPY_VISIBILITY_HIDDEN int
+diophantine_simplify(unsigned int *n, diophantine_term_t *E, npy_int64 b)
+{
+ unsigned int i, j, m;
+ char overflow = 0;
+
+ /* Skip obviously infeasible cases */
+ for (j = 0; j < *n; ++j) {
+ if (E[j].ub < 0) {
+ return 0;
+ }
+ }
+
+ if (b < 0) {
+ return 0;
+ }
+
+ /* Sort vs. coefficients */
+ qsort(E, *n, sizeof(diophantine_term_t), diophantine_sort_A);
+
+ /* Combine identical coefficients */
+ m = *n;
+ i = 0;
+ for (j = 1; j < m; ++j) {
+ if (E[i].a == E[j].a) {
+ E[i].ub = safe_add(E[i].ub, E[j].ub, &overflow);
+ --*n;
+ }
+ else {
+ ++i;
+ if (i != j) {
+ E[i] = E[j];
+ }
+ }
+ }
+
+ /* Trim bounds and remove unnecessary variables */
+ m = *n;
+ i = 0;
+ for (j = 0; j < m; ++j) {
+ E[j].ub = MIN(E[j].ub, b / E[j].a);
+ if (E[j].ub == 0) {
+ /* If the problem is feasible at all, x[i]=0 */
+ --*n;
+ }
+ else {
+ if (i != j) {
+ E[i] = E[j];
+ }
+ ++i;
+ }
+ }
+
+ if (overflow) {
+ return -1;
+ }
+ else {
+ return 0;
+ }
+}
+
+
+/* Gets a half-open range [start, end) of offsets from the data pointer */
+NPY_VISIBILITY_HIDDEN void
+offset_bounds_from_strides(const int itemsize, const int nd,
+ const npy_intp *dims, const npy_intp *strides,
+ npy_intp *lower_offset, npy_intp *upper_offset)
+{
+ npy_intp max_axis_offset;
+ npy_intp lower = 0;
+ npy_intp upper = 0;
+ int i;
+
+ for (i = 0; i < nd; i++) {
+ if (dims[i] == 0) {
+ /* If the array size is zero, return an empty range */
+ *lower_offset = 0;
+ *upper_offset = 0;
+ return;
+ }
+ /* Expand either upwards or downwards depending on stride */
+ max_axis_offset = strides[i] * (dims[i] - 1);
+ if (max_axis_offset > 0) {
+ upper += max_axis_offset;
+ }
+ else {
+ lower += max_axis_offset;
+ }
+ }
+ /* Return a half-open range */
+ upper += itemsize;
+ *lower_offset = lower;
+ *upper_offset = upper;
+}
+
+
+/* Gets a half-open range [start, end) which contains the array data */
+static void
+get_array_memory_extents(PyArrayObject *arr,
+ npy_uintp *out_start, npy_uintp *out_end,
+ npy_uintp *num_bytes)
+{
+ npy_intp low, upper;
+ int j;
+ offset_bounds_from_strides(PyArray_ITEMSIZE(arr), PyArray_NDIM(arr),
+ PyArray_DIMS(arr), PyArray_STRIDES(arr),
+ &low, &upper);
+ *out_start = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)low;
+ *out_end = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)upper;
+
+ *num_bytes = PyArray_ITEMSIZE(arr);
+ for (j = 0; j < PyArray_NDIM(arr); ++j) {
+ *num_bytes *= PyArray_DIM(arr, j);
+ }
+}
+
+
+static int
+strides_to_terms(PyArrayObject *arr, diophantine_term_t *terms,
+ unsigned int *nterms, int skip_empty)
+{
+ unsigned int i;
+
+ for (i = 0; i < PyArray_NDIM(arr); ++i) {
+ if (skip_empty) {
+ if (PyArray_DIM(arr, i) <= 1 || PyArray_STRIDE(arr, i) == 0) {
+ continue;
+ }
+ }
+
+ terms[*nterms].a = PyArray_STRIDE(arr, i);
+
+ if (terms[*nterms].a < 0) {
+ terms[*nterms].a = -terms[*nterms].a;
+ }
+
+ if (terms[*nterms].a < 0) {
+ /* integer overflow */
+ return 1;
+ }
+
+ terms[*nterms].ub = PyArray_DIM(arr, i) - 1;
+ ++*nterms;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Determine whether two arrays share some memory.
+ *
+ * Returns: 0 (no shared memory), 1 (shared memory), or < 0 (failed to solve).
+ *
+ * Note that failures to solve can occur due to integer overflows, or effort
+ * required solving the problem exceeding max_work. The general problem is
+ * NP-hard and worst case runtime is exponential in the number of dimensions.
+ * max_work controls the amount of work done, either exact (max_work == -1), only
+ * a simple memory extent check (max_work == 0), or set an upper bound
+ * max_work > 0 for the number of solution candidates considered.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_share_memory(PyArrayObject *a, PyArrayObject *b,
+ Py_ssize_t max_work)
+{
+ npy_int64 rhs;
+ diophantine_term_t terms[2*NPY_MAXDIMS+2];
+ npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0, size1 = 0, size2 = 0;
+ npy_int64 x[2*NPY_MAXDIMS+2];
+ unsigned int nterms;
+
+ get_array_memory_extents(a, &start1, &end1, &size1);
+ get_array_memory_extents(b, &start2, &end2, &size2);
+
+ if (!(start1 < end2 && start2 < end1 && start1 < end1 && start2 < end2)) {
+ /* Memory extents don't overlap */
+ return MEM_OVERLAP_NO;
+ }
+
+ if (max_work == 0) {
+ /* Too much work required, give up */
+ return MEM_OVERLAP_TOO_HARD;
+ }
+
+ /* Convert problem to Diophantine equation form with positive coefficients.
+ The bounds computed by offset_bounds_from_strides correspond to
+ all-positive strides.
+
+ start1 + sum(abs(stride1)*x1)
+ == start2 + sum(abs(stride2)*x2)
+ == end1 - 1 - sum(abs(stride1)*x1')
+ == end2 - 1 - sum(abs(stride2)*x2')
+
+ <=>
+
+ sum(abs(stride1)*x1) + sum(abs(stride2)*x2')
+ == end2 - 1 - start1
+
+ OR
+
+ sum(abs(stride1)*x1') + sum(abs(stride2)*x2)
+ == end1 - 1 - start2
+
+ We pick the problem with the smaller RHS (they are non-negative due to
+ the extent check above.)
+ */
+
+ rhs = MIN(end2 - 1 - start1, end1 - 1 - start2);
+
+ if (rhs != (npy_uintp)rhs) {
+ /* Integer overflow */
+ return MEM_OVERLAP_OVERFLOW;
+ }
+
+ nterms = 0;
+ if (strides_to_terms(a, terms, &nterms, 1)) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+ if (strides_to_terms(b, terms, &nterms, 1)) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+ if (PyArray_ITEMSIZE(a) > 1) {
+ terms[nterms].a = 1;
+ terms[nterms].ub = PyArray_ITEMSIZE(a) - 1;
+ ++nterms;
+ }
+ if (PyArray_ITEMSIZE(b) > 1) {
+ terms[nterms].a = 1;
+ terms[nterms].ub = PyArray_ITEMSIZE(b) - 1;
+ ++nterms;
+ }
+
+ /* Simplify, if possible */
+ if (diophantine_simplify(&nterms, terms, rhs)) {
+ /* Integer overflow */
+ return MEM_OVERLAP_OVERFLOW;
+ }
+
+ /* Solve */
+ return solve_diophantine(nterms, terms, rhs, max_work, 0, x);
+}
+
+
+/**
+ * Determine whether an array has internal overlap.
+ *
+ * Returns: 0 (no overlap), 1 (overlap), or < 0 (failed to solve).
+ *
+ * max_work and reasons for solver failures are as in solve_may_share_memory.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_have_internal_overlap(PyArrayObject *a, Py_ssize_t max_work)
+{
+ diophantine_term_t terms[NPY_MAXDIMS+1];
+ npy_int64 x[NPY_MAXDIMS+1];
+ unsigned int nterms;
+ int i, j;
+
+ if (PyArray_ISCONTIGUOUS(a)) {
+ /* Quick case */
+ return MEM_OVERLAP_NO;
+ }
+
+ /* The internal memory overlap problem is looking for two different
+ solutions to
+
+ sum(a*x) = b, 0 <= x[i] <= ub[i]
+
+ for any b. Equivalently,
+
+ sum(a*x0) - sum(a*x1) = 0
+
+ Mapping the coefficients on the left by x0'[i] = x0[i] if a[i] > 0
+ else ub[i]-x0[i] and opposite for x1, we have
+
+ sum(abs(a)*(x0' + x1')) = sum(abs(a)*ub)
+
+ Now, x0!=x1 if for some i we have x0'[i] + x1'[i] != ub[i].
+ We can now change variables to z[i] = x0'[i] + x1'[i] so the problem
+ becomes
+
+ sum(abs(a)*z) = sum(abs(a)*ub), 0 <= z[i] <= 2*ub[i], z != ub
+
+ This can be solved with solve_diophantine.
+ */
+
+ nterms = 0;
+ if (strides_to_terms(a, terms, &nterms, 0)) {
+ return MEM_OVERLAP_OVERFLOW;
+ }
+ if (PyArray_ITEMSIZE(a) > 1) {
+ terms[nterms].a = 1;
+ terms[nterms].ub = PyArray_ITEMSIZE(a) - 1;
+ ++nterms;
+ }
+
+ /* Get rid of zero coefficients and empty terms */
+ i = 0;
+ for (j = 0; j < nterms; ++j) {
+ if (terms[j].ub == 0) {
+ continue;
+ }
+ else if (terms[j].ub < 0) {
+ return MEM_OVERLAP_NO;
+ }
+ else if (terms[j].a == 0) {
+ return MEM_OVERLAP_YES;
+ }
+ if (i != j) {
+ terms[i] = terms[j];
+ }
+ ++i;
+ }
+ nterms = i;
+
+ /* Double bounds to get the internal overlap problem */
+ for (j = 0; j < nterms; ++j) {
+ terms[j].ub *= 2;
+ }
+
+ /* Sort vs. coefficients; cannot call diophantine_simplify because it may
+ change the decision problem inequality part */
+ qsort(terms, nterms, sizeof(diophantine_term_t), diophantine_sort_A);
+
+ /* Solve */
+ return solve_diophantine(nterms, terms, -1, max_work, 1, x);
+}
diff --git a/numpy/core/src/private/mem_overlap.h b/numpy/core/src/private/mem_overlap.h
new file mode 100644
index 000000000..8044f1663
--- /dev/null
+++ b/numpy/core/src/private/mem_overlap.h
@@ -0,0 +1,50 @@
+#ifndef MEM_OVERLAP_H_
+#define MEM_OVERLAP_H_
+
+#include "npy_config.h"
+#include "numpy/ndarraytypes.h"
+
+
+/* Bounds check only */
+#define NPY_MAY_SHARE_BOUNDS 0
+
+/* Exact solution */
+#define NPY_MAY_SHARE_EXACT -1
+
+
+typedef enum {
+ MEM_OVERLAP_NO = 0, /* no solution exists */
+ MEM_OVERLAP_YES = 1, /* solution found */
+ MEM_OVERLAP_TOO_HARD = -1, /* max_work exceeded */
+ MEM_OVERLAP_OVERFLOW = -2, /* algorithm failed due to integer overflow */
+ MEM_OVERLAP_ERROR = -3 /* invalid input */
+} mem_overlap_t;
+
+
+typedef struct {
+ npy_int64 a;
+ npy_int64 ub;
+} diophantine_term_t;
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_diophantine(unsigned int n, diophantine_term_t *E,
+ npy_int64 b, Py_ssize_t max_work, int require_nontrivial,
+ npy_int64 *x);
+
+NPY_VISIBILITY_HIDDEN int
+diophantine_simplify(unsigned int *n, diophantine_term_t *E, npy_int64 b);
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_share_memory(PyArrayObject *a, PyArrayObject *b,
+ Py_ssize_t max_work);
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_have_internal_overlap(PyArrayObject *a, Py_ssize_t max_work);
+
+NPY_VISIBILITY_HIDDEN void
+offset_bounds_from_strides(const int itemsize, const int nd,
+ const npy_intp *dims, const npy_intp *strides,
+ npy_intp *lower_offset, npy_intp *upper_offset);
+
+#endif
+
diff --git a/numpy/core/src/private/npy_extint128.h b/numpy/core/src/private/npy_extint128.h
new file mode 100644
index 000000000..6a35e736f
--- /dev/null
+++ b/numpy/core/src/private/npy_extint128.h
@@ -0,0 +1,317 @@
+#ifndef NPY_EXTINT128_H_
+#define NPY_EXTINT128_H_
+
+
+typedef struct {
+ char sign;
+ npy_uint64 lo, hi;
+} npy_extint128_t;
+
+
+/* Integer addition with overflow checking */
+static NPY_INLINE npy_int64
+safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+ if (a > 0 && b > NPY_MAX_INT64 - a) {
+ *overflow_flag = 1;
+ }
+ else if (a < 0 && b < NPY_MIN_INT64 - a) {
+ *overflow_flag = 1;
+ }
+ return a + b;
+}
+
+
+/* Integer subtraction with overflow checking */
+static NPY_INLINE npy_int64
+safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+ if (a >= 0 && b < a - NPY_MAX_INT64) {
+ *overflow_flag = 1;
+ }
+ else if (a < 0 && b > a - NPY_MIN_INT64) {
+ *overflow_flag = 1;
+ }
+ return a - b;
+}
+
+
+/* Integer multiplication with overflow checking */
+static NPY_INLINE npy_int64
+safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+ if (a > 0) {
+ if (b > NPY_MAX_INT64 / a || b < NPY_MIN_INT64 / a) {
+ *overflow_flag = 1;
+ }
+ }
+ else if (a < 0) {
+ if (b > 0 && a < NPY_MIN_INT64 / b) {
+ *overflow_flag = 1;
+ }
+ else if (b < 0 && a < NPY_MAX_INT64 / b) {
+ *overflow_flag = 1;
+ }
+ }
+ return a * b;
+}
+
+
+/* Long integer init */
+static NPY_INLINE npy_extint128_t
+to_128(npy_int64 x)
+{
+ npy_extint128_t result;
+ result.sign = (x >= 0 ? 1 : -1);
+ if (x >= 0) {
+ result.lo = x;
+ }
+ else {
+ result.lo = (npy_uint64)(-(x + 1)) + 1;
+ }
+ result.hi = 0;
+ return result;
+}
+
+
+static NPY_INLINE npy_int64
+to_64(npy_extint128_t x, char *overflow)
+{
+ if (x.hi != 0 ||
+ (x.sign > 0 && x.lo > NPY_MAX_INT64) ||
+ (x.sign < 0 && x.lo != 0 && x.lo - 1 > -(NPY_MIN_INT64 + 1))) {
+ *overflow = 1;
+ }
+ return x.lo * x.sign;
+}
+
+
+/* Long integer multiply */
+static NPY_INLINE npy_extint128_t
+mul_64_64(npy_int64 a, npy_int64 b)
+{
+ npy_extint128_t x, y, z;
+ npy_uint64 x1, x2, y1, y2, r1, r2, prev;
+
+ x = to_128(a);
+ y = to_128(b);
+
+ x1 = x.lo & 0xffffffff;
+ x2 = x.lo >> 32;
+
+ y1 = y.lo & 0xffffffff;
+ y2 = y.lo >> 32;
+
+ r1 = x1*y2;
+ r2 = x2*y1;
+
+ z.sign = x.sign * y.sign;
+ z.hi = x2*y2 + (r1 >> 32) + (r2 >> 32);
+ z.lo = x1*y1;
+
+ /* Add with carry */
+ prev = z.lo;
+ z.lo += (r1 << 32);
+ if (z.lo < prev) {
+ ++z.hi;
+ }
+
+ prev = z.lo;
+ z.lo += (r2 << 32);
+ if (z.lo < prev) {
+ ++z.hi;
+ }
+
+ return z;
+}
+
+
+/* Long integer add */
+static NPY_INLINE npy_extint128_t
+add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
+{
+ npy_extint128_t z;
+
+ if (x.sign == y.sign) {
+ z.sign = x.sign;
+ z.hi = x.hi + y.hi;
+ if (z.hi < x.hi) {
+ *overflow = 1;
+ }
+ z.lo = x.lo + y.lo;
+ if (z.lo < x.lo) {
+ if (z.hi == NPY_MAX_UINT64) {
+ *overflow = 1;
+ }
+ ++z.hi;
+ }
+ }
+ else if (x.hi > y.hi || (x.hi == y.hi && x.lo >= y.lo)) {
+ z.sign = x.sign;
+ z.hi = x.hi - y.hi;
+ z.lo = x.lo;
+ z.lo -= y.lo;
+ if (z.lo > x.lo) {
+ --z.hi;
+ }
+ }
+ else {
+ z.sign = y.sign;
+ z.hi = y.hi - x.hi;
+ z.lo = y.lo;
+ z.lo -= x.lo;
+ if (z.lo > y.lo) {
+ --z.hi;
+ }
+ }
+
+ return z;
+}
+
+
+/* Long integer negation */
+static NPY_INLINE npy_extint128_t
+neg_128(npy_extint128_t x)
+{
+ npy_extint128_t z = x;
+ z.sign *= -1;
+ return z;
+}
+
+
+static NPY_INLINE npy_extint128_t
+sub_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
+{
+ return add_128(x, neg_128(y), overflow);
+}
+
+
+static NPY_INLINE npy_extint128_t
+shl_128(npy_extint128_t v)
+{
+ npy_extint128_t z;
+ z = v;
+ z.hi <<= 1;
+ z.hi |= (z.lo & (((npy_uint64)1) << 63)) >> 63;
+ z.lo <<= 1;
+ return z;
+}
+
+
+static NPY_INLINE npy_extint128_t
+shr_128(npy_extint128_t v)
+{
+ npy_extint128_t z;
+ z = v;
+ z.lo >>= 1;
+ z.lo |= (z.hi & 0x1) << 63;
+ z.hi >>= 1;
+ return z;
+}
+
+static NPY_INLINE int
+gt_128(npy_extint128_t a, npy_extint128_t b)
+{
+ if (a.sign > 0 && b.sign > 0) {
+ return (a.hi > b.hi) || (a.hi == b.hi && a.lo > b.lo);
+ }
+ else if (a.sign < 0 && b.sign < 0) {
+ return (a.hi < b.hi) || (a.hi == b.hi && a.lo < b.lo);
+ }
+ else if (a.sign > 0 && b.sign < 0) {
+ return a.hi != 0 || a.lo != 0 || b.hi != 0 || b.lo != 0;
+ }
+ else {
+ return 0;
+ }
+}
+
+
+/* Long integer divide */
+static NPY_INLINE npy_extint128_t
+divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
+{
+ npy_extint128_t remainder, pointer, result, divisor;
+ char overflow = 0;
+
+ assert(b > 0);
+
+ if (b <= 1 || x.hi == 0) {
+ result.sign = x.sign;
+ result.lo = x.lo / b;
+ result.hi = x.hi / b;
+ *mod = x.sign * (x.lo % b);
+ return result;
+ }
+
+ /* Long division, not the most efficient choice */
+ remainder = x;
+ remainder.sign = 1;
+
+ divisor.sign = 1;
+ divisor.hi = 0;
+ divisor.lo = b;
+
+ result.sign = 1;
+ result.lo = 0;
+ result.hi = 0;
+
+ pointer.sign = 1;
+ pointer.lo = 1;
+ pointer.hi = 0;
+
+ while ((divisor.hi & (((npy_uint64)1) << 63)) == 0 &&
+ gt_128(remainder, divisor)) {
+ divisor = shl_128(divisor);
+ pointer = shl_128(pointer);
+ }
+
+ while (pointer.lo || pointer.hi) {
+ if (!gt_128(divisor, remainder)) {
+ remainder = sub_128(remainder, divisor, &overflow);
+ result = add_128(result, pointer, &overflow);
+ }
+ divisor = shr_128(divisor);
+ pointer = shr_128(pointer);
+ }
+
+ /* Fix signs and return; cannot overflow */
+ result.sign = x.sign;
+ *mod = x.sign * remainder.lo;
+
+ return result;
+}
+
+
+/* Divide and round down (positive divisor; no overflows) */
+static NPY_INLINE npy_extint128_t
+floordiv_128_64(npy_extint128_t a, npy_int64 b)
+{
+ npy_extint128_t result;
+ npy_int64 remainder;
+ char overflow = 0;
+ assert(b > 0);
+ result = divmod_128_64(a, b, &remainder);
+ if (a.sign < 0 && remainder != 0) {
+ result = sub_128(result, to_128(1), &overflow);
+ }
+ return result;
+}
+
+
+/* Divide and round up (positive divisor; no overflows) */
+static NPY_INLINE npy_extint128_t
+ceildiv_128_64(npy_extint128_t a, npy_int64 b)
+{
+ npy_extint128_t result;
+ npy_int64 remainder;
+ char overflow = 0;
+ assert(b > 0);
+ result = divmod_128_64(a, b, &remainder);
+ if (a.sign > 0 && remainder != 0) {
+ result = add_128(result, to_128(1), &overflow);
+ }
+ return result;
+}
+
+#endif
diff --git a/numpy/core/tests/test_extint128.py b/numpy/core/tests/test_extint128.py
new file mode 100644
index 000000000..2afae2f6b
--- /dev/null
+++ b/numpy/core/tests/test_extint128.py
@@ -0,0 +1,225 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+import contextlib
+import operator
+
+import numpy as np
+import numpy.core.multiarray_tests as mt
+from numpy.compat import long
+
+from numpy.testing import assert_raises, assert_equal
+
+
+INT64_MAX = np.iinfo(np.int64).max
+INT64_MIN = np.iinfo(np.int64).min
+INT64_MID = 2**32
+
+# int128 is not two's complement, the sign bit is separate
+INT128_MAX = 2**128 - 1
+INT128_MIN = -INT128_MAX
+INT128_MID = 2**64
+
+INT64_VALUES = (
+ [INT64_MIN + j for j in range(20)] +
+ [INT64_MAX - j for j in range(20)] +
+ [INT64_MID + j for j in range(-20, 20)] +
+ [2*INT64_MID + j for j in range(-20, 20)] +
+ [INT64_MID//2 + j for j in range(-20, 20)] +
+ list(range(-70, 70))
+)
+
+INT128_VALUES = (
+ [INT128_MIN + j for j in range(20)] +
+ [INT128_MAX - j for j in range(20)] +
+ [INT128_MID + j for j in range(-20, 20)] +
+ [2*INT128_MID + j for j in range(-20, 20)] +
+ [INT128_MID//2 + j for j in range(-20, 20)] +
+ list(range(-70, 70)) +
+ [False] # negative zero
+)
+
+INT64_POS_VALUES = [x for x in INT64_VALUES if x > 0]
+
+
+@contextlib.contextmanager
+def exc_iter(*args):
+ """
+ Iterate over Cartesian product of *args, and if an exception is raised,
+ add information of the current iterate.
+ """
+
+ value = [None]
+
+ def iterate():
+ for v in itertools.product(*args):
+ value[0] = v
+ yield v
+
+ try:
+ yield iterate()
+ except:
+ import traceback
+ msg = "At: %r\n%s" % (repr(value[0]),
+ traceback.format_exc())
+ raise AssertionError(msg)
+
+
+def test_safe_binop():
+ # Test checked arithmetic routines
+
+ ops = [
+ (operator.add, 1),
+ (operator.sub, 2),
+ (operator.mul, 3)
+ ]
+
+ with exc_iter(ops, INT64_VALUES, INT64_VALUES) as it:
+ for xop, a, b in it:
+ pyop, op = xop
+ c = pyop(a, b)
+
+ if not (INT64_MIN <= c <= INT64_MAX):
+ assert_raises(OverflowError, mt.extint_safe_binop, a, b, op)
+ else:
+ d = mt.extint_safe_binop(a, b, op)
+ if c != d:
+ # assert_equal is slow
+ assert_equal(d, c)
+
+
+def test_to_128():
+ with exc_iter(INT64_VALUES) as it:
+ for a, in it:
+ b = mt.extint_to_128(a)
+ if a != b:
+ assert_equal(b, a)
+
+
+def test_to_64():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if not (INT64_MIN <= a <= INT64_MAX):
+ assert_raises(OverflowError, mt.extint_to_64, a)
+ else:
+ b = mt.extint_to_64(a)
+ if a != b:
+ assert_equal(b, a)
+
+
+def test_mul_64_64():
+ with exc_iter(INT64_VALUES, INT64_VALUES) as it:
+ for a, b in it:
+ c = a * b
+ d = mt.extint_mul_64_64(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_add_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a + b
+ if not (INT128_MIN <= c <= INT128_MAX):
+ assert_raises(OverflowError, mt.extint_add_128, a, b)
+ else:
+ d = mt.extint_add_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_sub_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a - b
+ if not (INT128_MIN <= c <= INT128_MAX):
+ assert_raises(OverflowError, mt.extint_sub_128, a, b)
+ else:
+ d = mt.extint_sub_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_neg_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ b = -a
+ c = mt.extint_neg_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_shl_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if a < 0:
+ b = -(((-a) << 1) & (2**128-1))
+ else:
+ b = (a << 1) & (2**128-1)
+ c = mt.extint_shl_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_shr_128():
+ with exc_iter(INT128_VALUES) as it:
+ for a, in it:
+ if a < 0:
+ b = -((-a) >> 1)
+ else:
+ b = a >> 1
+ c = mt.extint_shr_128(a)
+ if b != c:
+ assert_equal(c, b)
+
+
+def test_gt_128():
+ with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+ for a, b in it:
+ c = a > b
+ d = mt.extint_gt_128(a, b)
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_divmod_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ if a >= 0:
+ c, cr = divmod(a, b)
+ else:
+ c, cr = divmod(-a, b)
+ c = -c
+ cr = -cr
+
+ d, dr = mt.extint_divmod_128_64(a, b)
+
+ if c != d or d != dr or b*d + dr != a:
+ assert_equal(d, c)
+ assert_equal(dr, cr)
+ assert_equal(b*d + dr, a)
+
+
+def test_floordiv_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ c = a // b
+ d = mt.extint_floordiv_128_64(a, b)
+
+ if c != d:
+ assert_equal(d, c)
+
+
+def test_ceildiv_128_64():
+ with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+ for a, b in it:
+ c = (a + b - 1) // b
+ d = mt.extint_ceildiv_128_64(a, b)
+
+ if c != d:
+ assert_equal(d, c)
+
+
+if __name__ == "__main__":
+ run_module_suite()
diff --git a/numpy/core/tests/test_mem_overlap.py b/numpy/core/tests/test_mem_overlap.py
new file mode 100644
index 000000000..728cc675d
--- /dev/null
+++ b/numpy/core/tests/test_mem_overlap.py
@@ -0,0 +1,486 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+
+import numpy as np
+from numpy.testing import run_module_suite, assert_, assert_raises, assert_equal
+
+from numpy.core.multiarray_tests import solve_diophantine, internal_overlap
+from numpy.lib.stride_tricks import as_strided
+from numpy.compat import long
+
+if sys.version_info[0] >= 3:
+ xrange = range
+
+
+ndims = 2
+size = 10
+shape = tuple([size] * ndims)
+
+MAY_SHARE_BOUNDS = 0
+MAY_SHARE_EXACT = -1
+
+
+def _indices_for_nelems(nelems):
+ """Returns slices of length nelems, from start onwards, in direction sign."""
+
+ if nelems == 0:
+ return [size // 2] # int index
+
+ res = []
+ for step in (1, 2):
+ for sign in (-1, 1):
+ start = size // 2 - nelems * step * sign // 2
+ stop = start + nelems * step * sign
+ res.append(slice(start, stop, step * sign))
+
+ return res
+
+
+def _indices_for_axis():
+ """Returns (src, dst) pairs of indices."""
+
+ res = []
+ for nelems in (0, 2, 3):
+ ind = _indices_for_nelems(nelems)
+
+ # no itertools.product available in Py2.4
+ res.extend([(a, b) for a in ind for b in ind]) # all assignments of size "nelems"
+
+ return res
+
+
+def _indices(ndims):
+ """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
+
+ ind = _indices_for_axis()
+
+ # no itertools.product available in Py2.4
+
+ res = [[]]
+ for i in range(ndims):
+ newres = []
+ for elem in ind:
+ for others in res:
+ newres.append([elem] + others)
+ res = newres
+
+ return res
+
+
+def _check_assignment(srcidx, dstidx):
+ """Check assignment arr[dstidx] = arr[srcidx] works."""
+
+ arr = np.arange(np.product(shape)).reshape(shape)
+
+ cpy = arr.copy()
+
+ cpy[dstidx] = arr[srcidx]
+ arr[dstidx] = arr[srcidx]
+
+ assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx)
+
+
+def test_overlapping_assignments():
+ """Test automatically generated assignments which overlap in memory."""
+
+ inds = _indices(ndims)
+
+ for ind in inds:
+ srcidx = tuple([a[0] for a in ind])
+ dstidx = tuple([a[1] for a in ind])
+
+ yield _check_assignment, srcidx, dstidx
+
+
+def test_diophantine_fuzz():
+ # Fuzz test the diophantine solver
+ rng = np.random.RandomState(1234)
+
+ max_int = np.iinfo(np.intp).max
+
+ for ndim in range(10):
+ feasible_count = 0
+ infeasible_count = 0
+
+ min_count = 500//(ndim + 1)
+
+ numbers = []
+ while min(feasible_count, infeasible_count) < min_count:
+ # Ensure big and small integer problems
+ A_max = 1 + rng.randint(0, 11)**6
+ U_max = rng.randint(0, 11)**6
+
+ A_max = min(max_int, A_max)
+ U_max = min(max_int-1, U_max)
+
+ A = tuple(rng.randint(1, A_max+1) for j in range(ndim))
+ U = tuple(rng.randint(0, U_max+2) for j in range(ndim))
+
+ b_ub = min(max_int-2, sum(a*ub for a, ub in zip(A, U)))
+ b = rng.randint(-1, b_ub+2)
+
+ if ndim == 0 and feasible_count < min_count:
+ b = 0
+
+ X = solve_diophantine(A, U, b)
+
+ if X is None:
+ # Check the simplified decision problem agrees
+ X_simplified = solve_diophantine(A, U, b, simplify=1)
+ assert X_simplified is None, (A, U, b, X_simplified)
+
+ # Check no solution exists (provided the problem is
+ # small enough so that brute force checking doesn't
+ # take too long)
+ try:
+ ranges = tuple(xrange(0, a*ub+1, a) for a, ub in zip(A, U))
+ except OverflowError:
+ # xrange on 32-bit Python 2 may overflow
+ continue
+
+ size = 1
+ for r in ranges:
+ size *= len(r)
+ if size < 100000:
+ assert_(not any(sum(w) == b for w in itertools.product(*ranges)))
+ infeasible_count += 1
+ else:
+ # Check the simplified decision problem agrees
+ X_simplified = solve_diophantine(A, U, b, simplify=1)
+ assert X_simplified is not None, (A, U, b, X_simplified)
+
+ # Check validity
+ assert_(sum(a*x for a, x in zip(A, X)) == b)
+ assert_(all(0 <= x <= ub for x, ub in zip(X, U)))
+ feasible_count += 1
+
+
+def test_diophantine_overflow():
+ # Smoke test integer overflow detection
+ max_intp = np.iinfo(np.intp).max
+ max_int64 = np.iinfo(np.int64).max
+
+ if max_int64 <= max_intp:
+ # Check that the algorithm works internally in 128-bit;
+ # solving this problem requires large intermediate numbers
+ A = (max_int64//2, max_int64//2 - 10)
+ U = (max_int64//2, max_int64//2 - 10)
+ b = 2*(max_int64//2) - 10
+
+ assert_equal(solve_diophantine(A, U, b), (1, 1))
+
+
+def check_may_share_memory_exact(a, b):
+ got = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+ assert_equal(np.may_share_memory(a, b),
+ np.may_share_memory(a, b, max_work=MAY_SHARE_BOUNDS))
+
+ a.fill(0)
+ b.fill(0)
+ a.fill(1)
+ exact = b.any()
+
+ err_msg = ""
+ if got != exact:
+ err_msg = " " + "\n ".join([
+ "base_a - base_b = %r" % (a.__array_interface__['data'][0] - b.__array_interface__['data'][0],),
+ "shape_a = %r" % (a.shape,),
+ "shape_b = %r" % (b.shape,),
+ "strides_a = %r" % (a.strides,),
+ "strides_b = %r" % (b.strides,),
+ "size_a = %r" % (a.size,),
+ "size_b = %r" % (b.size,)
+ ])
+
+ assert_equal(got, exact, err_msg=err_msg)
+
+
+def test_may_share_memory_manual():
+ # Manual test cases for may_share_memory
+
+ # Base arrays
+ xs0 = [
+ np.zeros([13, 21, 23, 22], dtype=np.int8),
+ np.zeros([13, 21, 23*2, 22], dtype=np.int8)[:,:,::2,:]
+ ]
+
+ # Generate all negative stride combinations
+ xs = []
+ for x in xs0:
+ for ss in itertools.product(*(([slice(None), slice(None, None, -1)],)*4)):
+ xp = x[ss]
+ xs.append(xp)
+
+ for x in xs:
+ # The default is a simple extent check
+ assert_(np.may_share_memory(x[:,0,:], x[:,1,:]))
+ assert_(np.may_share_memory(x[:,0,:], x[:,1,:], max_work=None))
+
+ # Exact checks
+ check_may_share_memory_exact(x[:,0,:], x[:,1,:])
+ check_may_share_memory_exact(x[:,::7], x[:,3::3])
+
+ try:
+ xp = x.ravel()
+ if xp.flags.owndata:
+ continue
+ xp = xp.view(np.int16)
+ except ValueError:
+ continue
+
+ # 0-size arrays cannot overlap
+ check_may_share_memory_exact(x.ravel()[6:6],
+ xp.reshape(13, 21, 23, 11)[:,::7])
+
+ # Test itemsize is dealt with
+ check_may_share_memory_exact(x[:,::7],
+ xp.reshape(13, 21, 23, 11))
+ check_may_share_memory_exact(x[:,::7],
+ xp.reshape(13, 21, 23, 11)[:,3::3])
+ check_may_share_memory_exact(x.ravel()[6:7],
+ xp.reshape(13, 21, 23, 11)[:,::7])
+
+ # Check unit size
+ x = np.zeros([1], dtype=np.int8)
+ check_may_share_memory_exact(x, x)
+ check_may_share_memory_exact(x, x.copy())
+
+
+def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
+ # Check that overlap problems with common strides are solved with
+ # little work.
+ x = np.zeros([17,34,71,97], dtype=np.int16)
+
+ rng = np.random.RandomState(1234)
+
+ def random_slice(n, step):
+ start = rng.randint(0, n+1)
+ stop = rng.randint(start, n+1)
+ if rng.randint(0, 2) == 0:
+ stop, start = start, stop
+ step *= -1
+ return slice(start, stop, step)
+
+ feasible = 0
+ infeasible = 0
+
+ while min(feasible, infeasible) < min_count:
+ steps = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+ for j in range(x.ndim))
+ if same_steps:
+ steps2 = steps
+ else:
+ steps2 = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+ for j in range(x.ndim))
+
+ t1 = np.arange(x.ndim)
+ rng.shuffle(t1)
+
+ t2 = np.arange(x.ndim)
+ rng.shuffle(t2)
+
+ s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+ s2 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps2))
+ a = x[s1].transpose(t1)
+ b = x[s2].transpose(t2)
+
+ bounds_overlap = np.may_share_memory(a, b)
+ may_share_answer = np.may_share_memory(a, b)
+ easy_answer = np.may_share_memory(a, b, max_work=get_max_work(a, b))
+ exact_answer = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+ if easy_answer != exact_answer:
+ # assert_equal is slow...
+ assert_equal(easy_answer, exact_answer, err_msg=repr((s1, s2)))
+
+ if may_share_answer != bounds_overlap:
+ assert_equal(may_share_answer, bounds_overlap,
+ err_msg=repr((s1, s2)))
+
+ if bounds_overlap:
+ if exact_answer:
+ feasible += 1
+ else:
+ infeasible += 1
+
+
+def test_may_share_memory_easy_fuzz():
+ # Check that overlap problems with common strides are always
+ # solved with little work.
+
+ check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: 1,
+ same_steps=True,
+ min_count=2000)
+
+
+def test_may_share_memory_harder_fuzz():
+ # Overlap problems with not necessarily common strides take more
+ # work.
+ #
+ # The work bound below can't be reduced much. Harder problems can
+ # also exist but not be detected here, as the set of problems
+ # comes from RNG.
+
+ check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: max(a.size, b.size)//2,
+ same_steps=False,
+ min_count=2000)
+
+
+def test_shares_memory_api():
+ x = np.zeros([4, 5, 6], dtype=np.int8)
+
+ assert_equal(np.shares_memory(x, x), True)
+ assert_equal(np.shares_memory(x, x.copy()), False)
+
+ a = x[:,::2,::3]
+ b = x[:,::3,::2]
+ assert_equal(np.shares_memory(a, b), True)
+ assert_equal(np.shares_memory(a, b, max_work=None), True)
+ assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=1)
+ assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=long(1))
+
+
+def test_internal_overlap_diophantine():
+ def check(A, U, exists=None):
+ X = solve_diophantine(A, U, 0, require_ub_nontrivial=1)
+
+ if exists is None:
+ exists = (X is not None)
+
+ if X is not None:
+ assert_(sum(a*x for a, x in zip(A, X)) == sum(a*u//2 for a, u in zip(A, U)))
+ assert_(all(0 <= x <= u for x, u in zip(X, U)))
+ assert_(any(x != u//2 for x, u in zip(X, U)))
+
+ if exists:
+ assert_(X is not None, repr(X))
+ else:
+ assert_(X is None, repr(X))
+
+ # Smoke tests
+ check((3, 2), (2*2, 3*2), exists=True)
+ check((3*2, 2), (15*2, (3-1)*2), exists=False)
+
+
+def test_internal_overlap_slices():
+ # Slicing an array never generates internal overlap
+
+ x = np.zeros([17,34,71,97], dtype=np.int16)
+
+ rng = np.random.RandomState(1234)
+
+ def random_slice(n, step):
+ start = rng.randint(0, n+1)
+ stop = rng.randint(start, n+1)
+ if rng.randint(0, 2) == 0:
+ stop, start = start, stop
+ step *= -1
+ return slice(start, stop, step)
+
+ cases = 0
+ min_count = 5000
+
+ while cases < min_count:
+ steps = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+ for j in range(x.ndim))
+ t1 = np.arange(x.ndim)
+ rng.shuffle(t1)
+ s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+ a = x[s1].transpose(t1)
+
+ assert not internal_overlap(a)
+ cases += 1
+
+
+def check_internal_overlap(a, manual_expected=None):
+ got = internal_overlap(a)
+
+ # Brute-force check
+ m = set()
+ ranges = tuple(xrange(n) for n in a.shape)
+ for v in itertools.product(*ranges):
+ offset = sum(s*w for s, w in zip(a.strides, v))
+ if offset in m:
+ expected = True
+ break
+ else:
+ m.add(offset)
+ else:
+ expected = False
+
+ # Compare
+ if got != expected:
+ assert_equal(got, expected, err_msg=repr((a.strides, a.shape)))
+ if manual_expected is not None and expected != manual_expected:
+ assert_equal(expected, manual_expected)
+ return got
+
+
+def test_internal_overlap_manual():
+ # Stride tricks can construct arrays with internal overlap
+
+ # We don't care about memory bounds, the array is not
+ # read/write accessed
+ x = np.arange(1).astype(np.int8)
+
+ # Check low-dimensional special cases
+
+ check_internal_overlap(x, False) # 1-dim
+ check_internal_overlap(x.reshape([]), False) # 0-dim
+
+ a = as_strided(x, strides=(3, 4), shape=(4, 4))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(3, 4), shape=(5, 4))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0,), shape=(0,))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0,), shape=(1,))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0,), shape=(2,))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0, -9993), shape=(87, 22))
+ check_internal_overlap(a, True)
+
+ a = as_strided(x, strides=(0, -9993), shape=(1, 22))
+ check_internal_overlap(a, False)
+
+ a = as_strided(x, strides=(0, -9993), shape=(0, 22))
+ check_internal_overlap(a, False)
+
+
+def test_internal_overlap_fuzz():
+ # Fuzz check; the brute-force check is fairly slow
+
+ x = np.arange(1).astype(np.int8)
+
+ overlap = 0
+ no_overlap = 0
+ min_count = 100
+
+ rng = np.random.RandomState(1234)
+
+ while min(overlap, no_overlap) < min_count:
+ ndim = rng.randint(1, 4)
+
+ strides = tuple(rng.randint(-1000, 1000) for j in range(ndim))
+ shape = tuple(rng.randint(1, 30) for j in range(ndim))
+
+ a = as_strided(x, strides=strides, shape=shape)
+ result = check_internal_overlap(a)
+
+ if result:
+ overlap += 1
+ else:
+ no_overlap += 1
+
+
+if __name__ == "__main__":
+ run_module_suite()
diff --git a/numpy/core/tests/test_multiarray_assignment.py b/numpy/core/tests/test_multiarray_assignment.py
deleted file mode 100644
index 86e1b125e..000000000
--- a/numpy/core/tests/test_multiarray_assignment.py
+++ /dev/null
@@ -1,84 +0,0 @@
-from __future__ import division, absolute_import, print_function
-
-import numpy as np
-from numpy.testing import run_module_suite
-
-ndims = 2
-size = 10
-shape = tuple([size] * ndims)
-
-
-def _indices_for_nelems(nelems):
- """Returns slices of length nelems, from start onwards, in direction sign."""
-
- if nelems == 0:
- return [size // 2] # int index
-
- res = []
- for step in (1, 2):
- for sign in (-1, 1):
- start = size // 2 - nelems * step * sign // 2
- stop = start + nelems * step * sign
- res.append(slice(start, stop, step * sign))
-
- return res
-
-
-def _indices_for_axis():
- """Returns (src, dst) pairs of indices."""
-
- res = []
- for nelems in (0, 2, 3):
- ind = _indices_for_nelems(nelems)
-
- # no itertools.product available in Py2.4
- res.extend([(a, b) for a in ind for b in ind]) # all assignments of size "nelems"
-
- return res
-
-
-def _indices(ndims):
- """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
-
- ind = _indices_for_axis()
-
- # no itertools.product available in Py2.4
-
- res = [[]]
- for i in range(ndims):
- newres = []
- for elem in ind:
- for others in res:
- newres.append([elem] + others)
- res = newres
-
- return res
-
-
-def _check_assignment(srcidx, dstidx):
- """Check assignment arr[dstidx] = arr[srcidx] works."""
-
- arr = np.arange(np.product(shape)).reshape(shape)
-
- cpy = arr.copy()
-
- cpy[dstidx] = arr[srcidx]
- arr[dstidx] = arr[srcidx]
-
- assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx)
-
-
-def test_overlapping_assignments():
- """Test automatically generated assignments which overlap in memory."""
-
- inds = _indices(ndims)
-
- for ind in inds:
- srcidx = tuple([a[0] for a in ind])
- dstidx = tuple([a[1] for a in ind])
-
- yield _check_assignment, srcidx, dstidx
-
-
-if __name__ == "__main__":
- run_module_suite()