23 files changed, 2816 insertions, 164 deletions
diff --git a/doc/release/1.11.0-notes.rst b/doc/release/1.11.0-notes.rst
index cb1b19a92..3ca77eea5 100644
--- a/doc/release/1.11.0-notes.rst
+++ b/doc/release/1.11.0-notes.rst
@@ -29,6 +29,9 @@ as the argument to 'bins' results in the corresponding estimator being used.
 has been added, converting the previous vbench-based one. You can run the suite locally
 via ``python runtests.py --bench``. For more details, see ``benchmarks/README.rst``.
 
+* A new function ``np.shares_memory`` that can check exactly whether two
+arrays have memory overlap is added. ``np.may_share_memory`` also now
+has an option to spend more effort to reduce false positives.
 
 Improvements
 ============
diff --git a/doc/source/reference/routines.other.rst b/doc/source/reference/routines.other.rst
index 354f45733..a3a1f8a06 100644
--- a/doc/source/reference/routines.other.rst
+++ b/doc/source/reference/routines.other.rst
@@ -22,3 +22,12 @@ Performance tuning
    restoredot
    setbufsize
    getbufsize
+
+Memory ranges
+-------------
+
+.. autosummary::
+   :toctree: generated/
+
+   shares_memory
+   may_share_memory
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 195789a39..607e28a28 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -3784,24 +3784,40 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
     """))
 
 
-add_newdoc('numpy.core.multiarray', 'may_share_memory',
+add_newdoc('numpy.core.multiarray', 'shares_memory',
     """
-    Determine if two arrays can share memory
+    shares_memory(a, b, max_work=None)
 
-    The memory-bounds of a and b are computed.  If they overlap then
-    this function returns True.  Otherwise, it returns False.
-
-    A return of True does not necessarily mean that the two arrays
-    share any element.  It just means that they *might*.
+    Determine if two arrays share memory
 
     Parameters
     ----------
     a, b : ndarray
+        Input arrays
+    max_work : int, optional
+        Effort to spend on solving the overlap problem (maximum number
+        of candidate solutions to consider). The following special
+        values are recognized:
+
+        max_work=MAY_SHARE_EXACT  (default)
+            The problem is solved exactly. In this case, the function returns
+            True only if there is an element shared between the arrays.
+        max_work=MAY_SHARE_BOUNDS
+            Only the memory bounds of a and b are checked.
+
+    Raises
+    ------
+    numpy.TooHardError
+        Exceeded max_work.
 
     Returns
     -------
     out : bool
 
+    See Also
+    --------
+    may_share_memory
+
     Examples
     --------
     >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index bf492d105..879f4a224 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -759,3 +759,7 @@ def _gcd(a, b):
     while b:
         a, b = b, a % b
     return a
+
+# Exception used in shares_memory()
+class TooHardError(RuntimeError):
+    pass
diff --git a/numpy/core/bento.info b/numpy/core/bento.info
index aaf261ddc..0c335e08a 100644
--- a/numpy/core/bento.info
+++ b/numpy/core/bento.info
@@ -22,7 +22,8 @@ Library:
             src/multiarray/multiarraymodule_onefile.c
     Extension: multiarray_tests
         Sources:
-            src/multiarray/multiarray_tests.c.src
+            src/multiarray/multiarray_tests.c.src,
+            src/private/mem_overlap.c
     Extension: umath
         Sources:
             src/umath/umathmodule_onefile.c
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 4eea31ed7..944c2996f 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -484,6 +484,7 @@ def pre_build(context):
                 pjoin('src', 'multiarray', 'usertypes.c'),
                 pjoin('src', 'multiarray', 'vdot.c'),
                 pjoin('src', 'private', 'templ_common.h.src'),
+                pjoin('src', 'private', 'mem_overlap.c'),
                 ]
 
             if bld.env.HAS_CBLAS:
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 532ef2950..05fea557a 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -1,9 +1,9 @@
 from __future__ import division, absolute_import, print_function
 
-__all__ = ['logspace', 'linspace']
+__all__ = ['logspace', 'linspace', 'may_share_memory']
 
 from . import numeric as _nx
-from .numeric import result_type, NaN
+from .numeric import result_type, NaN, shares_memory, MAY_SHARE_BOUNDS, TooHardError
 
 
 def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):
@@ -201,3 +201,46 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
     if dtype is None:
         return _nx.power(base, y)
     return _nx.power(base, y).astype(dtype)
+
+
+def may_share_memory(a, b, max_work=None):
+    """Determine if two arrays can share memory
+
+    A return of True does not necessarily mean that the two arrays
+    share any element.  It just means that they *might*.
+
+    Only the memory bounds of a and b are checked by default.
+
+    Parameters
+    ----------
+    a, b : ndarray
+        Input arrays
+    max_work : int, optional
+        Effort to spend on solving the overlap problem.  See
+        `shares_memory` for details.  Default for ``may_share_memory``
+        is to do a bounds check.
+
+    Returns
+    -------
+    out : bool
+
+    See Also
+    --------
+    shares_memory
+
+    Examples
+    --------
+    >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
+    False
+    >>> x = np.zeros([3, 4])
+    >>> np.may_share_memory(x[:,0], x[:,1])
+    True
+
+    """
+    if max_work is None:
+        max_work = MAY_SHARE_BOUNDS
+    try:
+        return shares_memory(a, b, max_work=max_work)
+    except (TooHardError, OverflowError):
+        # Unable to determine, assume yes
+        return True
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index f0163876f..1b7dfca3e 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -10,6 +10,7 @@ from .umath import (invert, sin, UFUNC_BUFSIZE_DEFAULT, ERR_IGNORE,
                     ERR_DEFAULT, PINF, NAN)
 from . import numerictypes
 from .numerictypes import longlong, intc, int_, float_, complex_, bool_
+from ._internal import TooHardError
 
 if sys.version_info[0] >= 3:
     import pickle
@@ -39,8 +40,8 @@ __all__ = [
     'getbufsize', 'seterrcall', 'geterrcall', 'errstate', 'flatnonzero',
     'Inf', 'inf', 'infty', 'Infinity', 'nan', 'NaN', 'False_', 'True_',
     'bitwise_not', 'CLIP', 'RAISE', 'WRAP', 'MAXDIMS', 'BUFSIZE',
-    'ALLOW_THREADS', 'ComplexWarning', 'may_share_memory', 'full',
-    'full_like', 'matmul',
+    'ALLOW_THREADS', 'ComplexWarning', 'full', 'full_like', 'matmul',
+    'shares_memory', 'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'TooHardError',
     ]
 
 if sys.version_info[0] < 3:
@@ -65,6 +66,8 @@ RAISE = multiarray.RAISE
 MAXDIMS = multiarray.MAXDIMS
 ALLOW_THREADS = multiarray.ALLOW_THREADS
 BUFSIZE = multiarray.BUFSIZE
+MAY_SHARE_BOUNDS = multiarray.MAY_SHARE_BOUNDS
+MAY_SHARE_EXACT = multiarray.MAY_SHARE_EXACT
 
 ndarray = multiarray.ndarray
 flatiter = multiarray.flatiter
@@ -375,7 +378,7 @@ fromstring = multiarray.fromstring
 fromiter = multiarray.fromiter
 fromfile = multiarray.fromfile
 frombuffer = multiarray.frombuffer
-may_share_memory = multiarray.may_share_memory
+shares_memory = multiarray.shares_memory
 if sys.version_info[0] < 3:
     newbuffer = multiarray.newbuffer
     getbuffer = multiarray.getbuffer
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 99084b25e..6d9926d89 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -762,6 +762,8 @@ def configuration(parent_package='',top_path=None):
             join('src', 'private', 'npy_config.h'),
             join('src', 'private', 'templ_common.h.src'),
             join('src', 'private', 'lowlevel_strided_loops.h'),
+            join('src', 'private', 'mem_overlap.h'),
+            join('src', 'private', 'npy_extint128.h'),
             join('include', 'numpy', 'arrayobject.h'),
             join('include', 'numpy', '_neighborhood_iterator_imp.h'),
             join('include', 'numpy', 'npy_endian.h'),
@@ -831,6 +833,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'ucsnarrow.c'),
             join('src', 'multiarray', 'vdot.c'),
             join('src', 'private', 'templ_common.h.src'),
+            join('src', 'private', 'mem_overlap.c'),
             ]
 
     blas_info = get_info('blas_opt', 0)
@@ -959,7 +962,10 @@ def configuration(parent_package='',top_path=None):
     #######################################################################
 
     config.add_extension('multiarray_tests',
-                    sources=[join('src', 'multiarray', 'multiarray_tests.c.src')])
+                    sources=[join('src', 'multiarray', 'multiarray_tests.c.src'),
+                             join('src', 'private', 'mem_overlap.c')],
+                    depends=[join('src', 'private', 'mem_overlap.h'),
+                             join('src', 'private', 'npy_extint128.h')])
 
     #######################################################################
     #                        operand_flag_tests module                    #
diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c
index fa764d758..a48e245d8 100644
--- a/numpy/core/src/multiarray/array_assign.c
+++ b/numpy/core/src/multiarray/array_assign.c
@@ -23,6 +23,7 @@
 #include "array_assign.h"
 #include "common.h"
 #include "lowlevel_strided_loops.h"
+#include "mem_overlap.h"
 
 /* See array_assign.h for parameter documentation */
 NPY_NO_EXPORT int
@@ -101,27 +102,17 @@ raw_array_is_aligned(int ndim, char *data, npy_intp *strides, int alignment)
 }
 
 
-/* Gets a half-open range [start, end) which contains the array data */
-NPY_NO_EXPORT void
-get_array_memory_extents(PyArrayObject *arr,
-                         npy_uintp *out_start, npy_uintp *out_end)
-{
-    npy_intp low, upper;
-    offset_bounds_from_strides(PyArray_ITEMSIZE(arr), PyArray_NDIM(arr),
-                               PyArray_DIMS(arr), PyArray_STRIDES(arr),
-                               &low, &upper);
-    *out_start = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)low;
-    *out_end = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)upper;
-}
-
 /* Returns 1 if the arrays have overlapping data, 0 otherwise */
 NPY_NO_EXPORT int
 arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2)
 {
-    npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0;
-
-    get_array_memory_extents(arr1, &start1, &end1);
-    get_array_memory_extents(arr2, &start2, &end2);
+    mem_overlap_t result;
 
-    return (start1 < end2) && (start2 < end1);
+    result = solve_may_share_memory(arr1, arr2, NPY_MAY_SHARE_BOUNDS);
+    if (result == MEM_OVERLAP_NO) {
+        return 0;
+    }
+    else {
+        return 1;
+    }
 }
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 86ea38cd7..fd5b15a0a 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -51,6 +51,7 @@ maintainer email:  oliphant.travis@ieee.org
 #include "buffer.h"
 #include "array_assign.h"
 #include "alloc.h"
+#include "mem_overlap.h"
 
 /*NUMPY_API
   Compute the size of an array (in number of items)
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index a5f3b3d55..3352c3529 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -765,39 +765,6 @@ _IsWriteable(PyArrayObject *ap)
 }
 
 
-/* Gets a half-open range [start, end) of offsets from the data pointer */
-NPY_NO_EXPORT void
-offset_bounds_from_strides(const int itemsize, const int nd,
-                           const npy_intp *dims, const npy_intp *strides,
-                           npy_intp *lower_offset, npy_intp *upper_offset) {
-    npy_intp max_axis_offset;
-    npy_intp lower = 0;
-    npy_intp upper = 0;
-    int i;
-
-    for (i = 0; i < nd; i++) {
-        if (dims[i] == 0) {
-            /* If the array size is zero, return an empty range */
-            *lower_offset = 0;
-            *upper_offset = 0;
-            return;
-        }
-        /* Expand either upwards or downwards depending on stride */
-        max_axis_offset = strides[i] * (dims[i] - 1);
-        if (max_axis_offset > 0) {
-            upper += max_axis_offset;
-        }
-        else {
-            lower += max_axis_offset;
-        }
-    }
-    /* Return a half-open range */
-    upper += itemsize;
-    *lower_offset = lower;
-    *upper_offset = upper;
-}
-
-
 /**
  * Convert an array shape to a string such as "(1, 2)".
  *
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 9cf2e27bf..11993829f 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -65,11 +65,6 @@ _IsAligned(PyArrayObject *ap);
 NPY_NO_EXPORT npy_bool
 _IsWriteable(PyArrayObject *ap);
 
-NPY_NO_EXPORT void
-offset_bounds_from_strides(const int itemsize, const int nd,
-                           const npy_intp *dims, const npy_intp *strides,
-                           npy_intp *lower_offset, npy_intp *upper_offset);
-
 NPY_NO_EXPORT PyObject *
 convert_shape_to_string(npy_intp n, npy_intp *vals, char *ending);
 
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 0b694deed..5147b9735 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -17,6 +17,7 @@
 #include "descriptor.h"
 #include "getset.h"
 #include "arrayobject.h"
+#include "mem_overlap.h"
 
 /*******************  array attribute get and set routines ******************/
 
diff --git a/numpy/core/src/multiarray/multiarray_tests.c.src b/numpy/core/src/multiarray/multiarray_tests.c.src
index e7f8e36fd..5e247e15e 100644
--- a/numpy/core/src/multiarray/multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/multiarray_tests.c.src
@@ -1,6 +1,9 @@
+/* -*-c-*- */
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #include <Python.h>
 #include "numpy/arrayobject.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
 
 /* test PyArray_IsPythonScalar, before including private py3 compat header */
 static PyObject *
@@ -947,6 +950,607 @@ test_nditer_too_large(PyObject *NPY_UNUSED(self), PyObject *args) {
 }
 
 
+static PyObject *
+array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+    PyObject *A = NULL;
+    PyObject *U = NULL;
+    Py_ssize_t b_input = 0;
+    Py_ssize_t max_work = -1;
+    int simplify = 0;
+    int require_ub_nontrivial = 0;
+    static char *kwlist[] = {"A", "U", "b", "max_work", "simplify",
+                             "require_ub_nontrivial", NULL};
+
+    diophantine_term_t terms[2*NPY_MAXDIMS+2];
+    npy_int64 x[2*NPY_MAXDIMS+2];
+    npy_int64 b;
+    unsigned int nterms, j;
+    mem_overlap_t result = MEM_OVERLAP_YES;
+    PyObject *retval = NULL;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O!n|nii", kwlist,
+                                     &PyTuple_Type, &A,
+                                     &PyTuple_Type, &U,
+                                     &b_input, &max_work, &simplify,
+                                     &require_ub_nontrivial)) {
+        return NULL;
+    }
+
+    if (PyTuple_GET_SIZE(A) > sizeof(terms) / sizeof(diophantine_term_t)) {
+        PyErr_SetString(PyExc_ValueError, "too many terms in equation");
+        goto fail;
+    }
+
+    nterms = PyTuple_GET_SIZE(A);
+
+    if (PyTuple_GET_SIZE(U) != nterms) {
+        PyErr_SetString(PyExc_ValueError, "A, U must be tuples of equal length");
+        goto fail;
+    }
+
+    for (j = 0; j < nterms; ++j) {
+        terms[j].a = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(A, j));
+        if (terms[j].a == -1 && PyErr_Occurred()) {
+            goto fail;
+        }
+        terms[j].ub = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(U, j));
+        if (terms[j].ub == -1 && PyErr_Occurred()) {
+            goto fail;
+        }
+    }
+
+    b = b_input;
+
+    NPY_BEGIN_THREADS;
+    if (simplify && !require_ub_nontrivial) {
+        if (diophantine_simplify(&nterms, terms, b)) {
+            result = MEM_OVERLAP_OVERFLOW;
+        }
+    }
+    if (result == MEM_OVERLAP_YES) {
+        result = solve_diophantine(nterms, terms, b, max_work, require_ub_nontrivial, x);
+    }
+    NPY_END_THREADS;
+
+    if (result == MEM_OVERLAP_YES) {
+        retval = PyTuple_New(nterms);
+        if (retval == NULL) {
+            goto fail;
+        }
+
+        for (j = 0; j < nterms; ++j) {
+            PyObject *obj;
+#if defined(NPY_PY3K)
+            obj = PyLong_FromSsize_t(x[j]);
+#else
+            obj = PyInt_FromSsize_t(x[j]);
+#endif
+            if (obj == NULL) {
+                goto fail;
+            }
+            PyTuple_SET_ITEM(retval, j, obj);
+        }
+    }
+    else if (result == MEM_OVERLAP_NO) {
+        retval = Py_None;
+        Py_INCREF(retval);
+    }
+    else if (result == MEM_OVERLAP_ERROR) {
+        PyErr_SetString(PyExc_ValueError, "Invalid arguments");
+    }
+    else if (result == MEM_OVERLAP_OVERFLOW) {
+        PyErr_SetString(PyExc_OverflowError, "Integer overflow");
+    }
+    else if (result == MEM_OVERLAP_TOO_HARD) {
+        PyErr_SetString(PyExc_RuntimeError, "Too much work done");
+    }
+    else {
+        PyErr_SetString(PyExc_RuntimeError, "Unknown error");
+    }
+
+    return retval;
+
+fail:
+    Py_XDECREF(retval);
+    return NULL;
+}
+
+
+static PyObject *
+array_internal_overlap(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+    PyArrayObject * self = NULL;
+    static char *kwlist[] = {"self", "max_work", NULL};
+
+    mem_overlap_t result;
+    Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|n", kwlist,
+                                     PyArray_Converter, &self,
+                                     &max_work)) {
+        return NULL;
+    }
+
+    if (max_work < -2) {
+        PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+        goto fail;
+    }
+
+    NPY_BEGIN_THREADS;
+    result = solve_may_have_internal_overlap(self, max_work);
+    NPY_END_THREADS;
+
+    Py_XDECREF(self);
+
+    if (result == MEM_OVERLAP_NO) {
+        Py_RETURN_FALSE;
+    }
+    else if (result == MEM_OVERLAP_YES) {
+        Py_RETURN_TRUE;
+    }
+    else if (result == MEM_OVERLAP_OVERFLOW) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "Integer overflow in computing overlap");
+        return NULL;
+    }
+    else if (result == MEM_OVERLAP_TOO_HARD) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Exceeded max_work");
+        return NULL;
+    }
+    else {
+        /* Doesn't happen usually */
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Error in computing overlap");
+        return NULL;
+    }
+
+fail:
+    Py_XDECREF(self);
+    return NULL;
+}
+
+
+static PyObject *
+pylong_from_int128(npy_extint128_t value)
+{
+    PyObject *val_64 = NULL, *val = NULL, *tmp = NULL, *tmp2 = NULL;
+
+    val_64 = PyLong_FromLong(64);
+    if (val_64 == NULL) {
+        goto fail;
+    }
+
+    val = PyLong_FromUnsignedLongLong(value.hi);
+    if (val == NULL) {
+        goto fail;
+    }
+
+    tmp = PyNumber_Lshift(val, val_64);
+    if (tmp == NULL) {
+        goto fail;
+    }
+
+    Py_DECREF(val);
+    val = tmp;
+
+    tmp = PyLong_FromUnsignedLongLong(value.lo);
+    if (tmp == NULL) {
+        goto fail;
+    }
+
+    tmp2 = PyNumber_Or(val, tmp);
+    if (tmp2 == NULL) {
+        goto fail;
+    }
+
+    Py_DECREF(val);
+    Py_DECREF(tmp);
+
+    val = NULL;
+    tmp = NULL;
+
+    if (value.sign < 0) {
+        val = PyNumber_Negative(tmp2);
+        if (val == NULL) {
+            goto fail;
+        }
+        Py_DECREF(tmp2);
+        return val;
+    }
+    else {
+        val = tmp2;
+    }
+    return val;
+
+fail:
+    Py_XDECREF(val_64);
+    Py_XDECREF(tmp);
+    Py_XDECREF(tmp2);
+    Py_XDECREF(val);
+    return NULL;
+}
+
+
+static int
+int128_from_pylong(PyObject *obj, npy_extint128_t *result)
+{
+    PyObject *long_obj = NULL, *val_64 = NULL, *val_0 = NULL,
+        *mask_64 = NULL, *max_128 = NULL, *hi_bits = NULL,
+        *lo_bits = NULL, *tmp = NULL;
+    int cmp;
+    int negative_zero = 0;
+
+    if (PyBool_Check(obj)) {
+        /* False means negative zero */
+        negative_zero = 1;
+    }
+
+    long_obj = PyObject_CallFunction((PyObject*)&PyLong_Type, "O", obj);
+    if (long_obj == NULL) {
+        goto fail;
+    }
+
+    val_0 = PyLong_FromLong(0);
+    if (val_0 == NULL) {
+        goto fail;
+    }
+
+    val_64 = PyLong_FromLong(64);
+    if (val_64 == NULL) {
+        goto fail;
+    }
+
+    mask_64 = PyLong_FromUnsignedLongLong(0xffffffffffffffffULL);
+    if (mask_64 == NULL) {
+        goto fail;
+    }
+
+    tmp = PyNumber_Lshift(mask_64, val_64);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    max_128 = PyNumber_Or(tmp, mask_64);
+    if (max_128 == NULL) {
+        goto fail;
+    }
+    Py_DECREF(tmp);
+    tmp = NULL;
+
+    cmp = PyObject_RichCompareBool(long_obj, val_0, Py_LT);
+    if (cmp == -1) {
+        goto fail;
+    }
+    else if (cmp == 1) {
+        tmp = PyNumber_Negative(long_obj);
+        if (tmp == NULL) {
+            goto fail;
+        }
+        Py_DECREF(long_obj);
+        long_obj = tmp;
+        tmp = NULL;
+        result->sign = -1;
+    }
+    else {
+        result->sign = 1;
+    }
+
+    cmp = PyObject_RichCompareBool(long_obj, max_128, Py_GT);
+    if (cmp == 1) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        goto fail;
+    }
+    else if (cmp == -1) {
+        goto fail;
+    }
+
+    hi_bits = PyNumber_Rshift(long_obj, val_64);
+    if (hi_bits == NULL) {
+        goto fail;
+    }
+
+    lo_bits = PyNumber_And(long_obj, mask_64);
+    if (lo_bits == NULL) {
+        goto fail;
+    }
+
+    result->hi = PyLong_AsUnsignedLongLong(hi_bits);
+    if (result->hi == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+        goto fail;
+    }
+
+    result->lo = PyLong_AsUnsignedLongLong(lo_bits);
+    if (result->lo == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+        goto fail;
+    }
+
+    if (negative_zero && result->hi == 0 && result->lo == 0) {
+        result->sign = -1;
+    }
+
+    Py_XDECREF(long_obj);
+    Py_XDECREF(val_64);
+    Py_XDECREF(val_0);
+    Py_XDECREF(mask_64);
+    Py_XDECREF(max_128);
+    Py_XDECREF(hi_bits);
+    Py_XDECREF(lo_bits);
+    Py_XDECREF(tmp);
+    return 0;
+
+fail:
+    Py_XDECREF(long_obj);
+    Py_XDECREF(val_64);
+    Py_XDECREF(val_0);
+    Py_XDECREF(mask_64);
+    Py_XDECREF(max_128);
+    Py_XDECREF(hi_bits);
+    Py_XDECREF(lo_bits);
+    Py_XDECREF(tmp);
+    return -1;
+}
+
+
+static PyObject *
+extint_safe_binop(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a, b, c;
+    int op;
+    char overflow = 0;
+    if (!PyArg_ParseTuple(args, "LLi", &a, &b, &op)) {
+        return NULL;
+    }
+    if (op == 1) {
+        c = safe_add(a, b, &overflow);
+    }
+    else if (op == 2) {
+        c = safe_sub(a, b, &overflow);
+    }
+    else if (op == 3) {
+        c = safe_mul(a, b, &overflow);
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError, "invalid op");
+        return NULL;
+    }
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        return NULL;
+    }
+    return PyLong_FromLongLong(c);
+}
+
+
+static PyObject *
+extint_to_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a;
+    if (!PyArg_ParseTuple(args, "L", &a)) {
+        return NULL;
+    }
+    return pylong_from_int128(to_128(a));
+}
+
+
+static PyObject *
+extint_to_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+    PY_LONG_LONG r;
+    char overflow = 0;
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    r = to_64(a, &overflow);
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        return NULL;
+    }
+    return PyLong_FromLongLong(r);
+}
+
+
+static PyObject *
+extint_mul_64_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a, b;
+    npy_extint128_t c;
+    if (!PyArg_ParseTuple(args, "LL", &a, &b)) {
+        return NULL;
+    }
+    c = mul_64_64(a, b);
+    return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_add_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b, c;
+    char overflow = 0;
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+    c = add_128(a, b, &overflow);
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        return NULL;
+    }
+    return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_sub_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b, c;
+    char overflow = 0;
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+    c = sub_128(a, b, &overflow);
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        return NULL;
+    }
+    return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_neg_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a, b;
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    b = neg_128(a);
+    return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shl_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a, b;
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    b = shl_128(a);
+    return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_shr_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a, b;
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    b = shr_128(a);
+    return pylong_from_int128(b);
+}
+
+
+static PyObject *
+extint_gt_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b;
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) || int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+    if (gt_128(a, b)) {
+        Py_RETURN_TRUE;
+    }
+    else {
+        Py_RETURN_FALSE;
+    }
+}
+
+
+static PyObject *
+extint_divmod_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *ret = NULL, *tmp = NULL;
+    npy_extint128_t a, c;
+    PY_LONG_LONG b;
+    npy_int64 mod;
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        goto fail;
+    }
+    if (b <= 0) {
+        PyErr_SetString(PyExc_ValueError, "");
+        goto fail;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        goto fail;
+    }
+
+    c = divmod_128_64(a, b, &mod);
+
+    ret = PyTuple_New(2);
+
+    tmp = pylong_from_int128(c);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    PyTuple_SET_ITEM(ret, 0, tmp);
+
+    tmp = PyLong_FromLongLong(mod);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    PyTuple_SET_ITEM(ret, 1, tmp);
+    return ret;
+
+fail:
+    Py_XDECREF(ret);
+    Py_XDECREF(tmp);
+    return NULL;
+}
+
+
+static PyObject *
+extint_floordiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a, c;
+    PY_LONG_LONG b;
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        return NULL;
+    }
+    if (b <= 0) {
+        PyErr_SetString(PyExc_ValueError, "");
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    c = floordiv_128_64(a, b);
+    return pylong_from_int128(c);
+}
+
+
+static PyObject *
+extint_ceildiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a, c;
+    PY_LONG_LONG b;
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        return NULL;
+    }
+    if (b <= 0) {
+        PyErr_SetString(PyExc_ValueError, "");
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    c = ceildiv_128_64(a, b);
+    return pylong_from_int128(c);
+}
+
+
 static PyMethodDef Multiarray_TestsMethods[] = {
     {"IsPythonScalar",
         IsPythonScalar,
@@ -989,6 +1593,51 @@ static PyMethodDef Multiarray_TestsMethods[] = {
     {"test_nditer_too_large",
         test_nditer_too_large,
         METH_VARARGS, NULL},
+    {"solve_diophantine",
+        (PyCFunction)array_solve_diophantine,
+        METH_VARARGS | METH_KEYWORDS, NULL},
+    {"internal_overlap",
+        (PyCFunction)array_internal_overlap,
+        METH_VARARGS | METH_KEYWORDS, NULL},
+    {"extint_safe_binop",
+        extint_safe_binop,
+        METH_VARARGS, NULL},
+    {"extint_to_128",
+        extint_to_128,
+        METH_VARARGS, NULL},
+    {"extint_to_64",
+        extint_to_64,
+        METH_VARARGS, NULL},
+    {"extint_mul_64_64",
+        extint_mul_64_64,
+        METH_VARARGS, NULL},
+    {"extint_add_128",
+        extint_add_128,
+        METH_VARARGS, NULL},
+    {"extint_sub_128",
+        extint_sub_128,
+        METH_VARARGS, NULL},
+    {"extint_neg_128",
+        extint_neg_128,
+        METH_VARARGS, NULL},
+    {"extint_shl_128",
+        extint_shl_128,
+        METH_VARARGS, NULL},
+    {"extint_shr_128",
+        extint_shr_128,
+        METH_VARARGS, NULL},
+    {"extint_gt_128",
+        extint_gt_128,
+        METH_VARARGS, NULL},
+    {"extint_divmod_128_64",
+        extint_divmod_128_64,
+        METH_VARARGS, NULL},
+    {"extint_floordiv_128_64",
+        extint_floordiv_128_64,
+        METH_VARARGS, NULL},
+    {"extint_ceildiv_128_64",
+        extint_ceildiv_128_64,
+        METH_VARARGS, NULL},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 04513c56c..e72c355dc 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -60,6 +60,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "vdot.h"
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
 #include "compiled_base.h"
+#include "mem_overlap.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -3993,30 +3994,88 @@ test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args)
     return PyInt_FromLong(a);
 }
 
+
 static PyObject *
-array_may_share_memory(PyObject *NPY_UNUSED(ignored), PyObject *args)
+array_shares_memory(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
     PyArrayObject * self = NULL;
     PyArrayObject * other = NULL;
-    int overlap;
+    PyObject *max_work_obj = NULL;
+    static char *kwlist[] = {"self", "other", "max_work", NULL};
+
+    mem_overlap_t result;
+    static PyObject *too_hard_cls = NULL;
+    Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+    NPY_BEGIN_THREADS_DEF;
 
-    if (!PyArg_ParseTuple(args, "O&O&", PyArray_Converter, &self,
-                          PyArray_Converter, &other)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&O&|O", kwlist,
+                                     PyArray_Converter, &self,
+                                     PyArray_Converter, &other,
+                                     &max_work_obj)) {
         return NULL;
     }
 
-    overlap = arrays_overlap(self, other);
+    if (max_work_obj == NULL || max_work_obj == Py_None) {
+        /* noop */
+    }
+    else if (PyLong_Check(max_work_obj)) {
+        max_work = PyLong_AsSsize_t(max_work_obj);
+    }
+#if !defined(NPY_PY3K)
+    else if (PyInt_Check(max_work_obj)) {
+        max_work = PyInt_AsSsize_t(max_work_obj);
+    }
+#endif
+    else {
+        PyErr_SetString(PyExc_ValueError, "max_work must be an integer");
+        goto fail;
+    }
+
+    if (max_work < -2) {
+        PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+        goto fail;
+    }
+
+    NPY_BEGIN_THREADS;
+    result = solve_may_share_memory(self, other, max_work);
+    NPY_END_THREADS;
+
     Py_XDECREF(self);
     Py_XDECREF(other);
 
-    if (overlap) {
+    if (result == MEM_OVERLAP_NO) {
+        Py_RETURN_FALSE;
+    }
+    else if (result == MEM_OVERLAP_YES) {
         Py_RETURN_TRUE;
     }
+    else if (result == MEM_OVERLAP_OVERFLOW) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "Integer overflow in computing overlap");
+        return NULL;
+    }
+    else if (result == MEM_OVERLAP_TOO_HARD) {
+        npy_cache_import("numpy.core._internal", "TooHardError",
+                         &too_hard_cls);
+        if (too_hard_cls) {
+            PyErr_SetString(too_hard_cls, "Exceeded max_work");
+        }
+        return NULL;
+    }
     else {
-        Py_RETURN_FALSE;
+        /* Doesn't happen usually */
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Error in computing overlap");
+        return NULL;
     }
+
+fail:
+    Py_XDECREF(self);
+    Py_XDECREF(other);
+    return NULL;
 }
 
+
 static struct PyMethodDef array_module_methods[] = {
     {"_get_ndarray_c_version",
         (PyCFunction)array__get_ndarray_c_version,
@@ -4123,9 +4182,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"result_type",
         (PyCFunction)array_result_type,
         METH_VARARGS, NULL},
-    {"may_share_memory",
-        (PyCFunction)array_may_share_memory,
-        METH_VARARGS, NULL},
+    {"shares_memory",
+        (PyCFunction)array_shares_memory,
+        METH_VARARGS | METH_KEYWORDS, NULL},
     /* Datetime-related functions */
     {"datetime_data",
         (PyCFunction)array_datetime_data,
@@ -4583,6 +4642,9 @@ PyMODINIT_FUNC initmultiarray(void) {
     ADDCONST(RAISE);
     ADDCONST(WRAP);
     ADDCONST(MAXDIMS);
+
+    ADDCONST(MAY_SHARE_BOUNDS);
+    ADDCONST(MAY_SHARE_EXACT);
 #undef ADDCONST
 
     Py_INCREF(&PyArray_Type);
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 3940d009b..3924f3cf4 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -53,6 +53,7 @@
 #include "ucsnarrow.c"
 #include "arrayobject.c"
 #include "numpymemoryview.c"
+#include "mem_overlap.c"
 #include "multiarraymodule.c"
 #include "compiled_base.c"
 
diff --git a/numpy/core/src/private/mem_overlap.c b/numpy/core/src/private/mem_overlap.c
new file mode 100644
index 000000000..3cab83497
--- /dev/null
+++ b/numpy/core/src/private/mem_overlap.c
@@ -0,0 +1,905 @@
+/*
+  Solving memory overlap integer programs and bounded Diophantine equations with
+  positive coefficients.
+
+  Asking whether two strided arrays `a` and `b` overlap is equivalent to
+  asking whether there is a solution to the following problem::
+
+      sum(stride_a[i] * x_a[i] for i in range(ndim_a))
+      -
+      sum(stride_b[i] * x_b[i] for i in range(ndim_b))
+      ==
+      base_b - base_a
+
+      0 <= x_a[i] < shape_a[i]
+      0 <= x_b[i] < shape_b[i]
+
+  for some integer x_a, x_b.  Itemsize needs to be considered as an additional
+  dimension with stride 1 and size itemsize.
+
+  Negative strides can be changed to positive (and vice versa) by changing
+  variables x[i] -> shape[i] - 1 - x[i], and zero strides can be dropped, so
+  that the problem can be recast into a bounded Diophantine equation with
+  positive coefficients::
+
+     sum(a[i] * x[i] for i in range(n)) == b
+
+     a[i] > 0
+
+     0 <= x[i] <= ub[i]
+
+  This problem is NP-hard --- runtime of algorithms grows exponentially with
+  increasing ndim.
+
+
+  *Algorithm description*
+
+  A straightforward algorithm that excludes infeasible solutions using GCD-based
+  pruning is outlined in Ref. [1]. It is implemented below. A number of other
+  algorithms exist in the literature; however, this one seems to have
+  performance satisfactory for the present purpose.
+
+  The idea is that an equation::
+
+      a_1 x_1 + a_2 x_2 + ... + a_n x_n = b
+      0 <= x_i <= ub_i, i = 1...n
+
+  implies::
+
+      a_2' x_2' + a_3 x_3 + ... + a_n x_n = b
+
+      0 <= x_i <= ub_i, i = 2...n
+
+      0 <= x_1' <= c_1 ub_1 + c_2 ub_2
+
+  with a_2' = gcd(a_1, a_2) and x_2' = c_1 x_1 + c_2 x_2 with c_1 = (a_1/a_1'),
+  and c_2 = (a_2/a_1').  This procedure can be repeated to obtain::
+
+      a_{n-1}' x_{n-1}' + a_n x_n = b
+
+      0 <= x_{n-1}' <= ub_{n-1}'
+
+      0 <= x_n <= ub_n
+
+  Now, one can enumerate all candidate solutions for x_n.  For each, one can use
+  the previous-level equation to enumerate potential solutions for x_{n-1}, with
+  transformed right-hand side b -> b - a_n x_n.  And so forth, until after n-1
+  nested for loops we either arrive at a candidate solution for x_1 (in which
+  case we have found one solution to the problem), or find that the equations do
+  not allow any solutions either for x_1 or one of the intermediate x_i (in
+  which case we have proved there is no solution for the upper-level candidates
+  chosen). If no solution is found for any candidate x_n, we have proved the
+  problem is infeasible --- which for the memory overlap problem means there is
+  no overlap.
+
+
+  *Performance*
+
+  Some common ndarray cases are easy for the algorithm:
+
+  - Two arrays whose memory ranges do not overlap.
+
+    These will be excluded by the bounds on x_n, with max_work=1. We also add
+    this check as a fast path, to avoid computing GCDs needlessly, as this can
+    take some time.
+
+  - Arrays produced by continuous slicing of a continuous parent array (no
+    internal overlap), e.g., a=x[:,0,:], b=x[:,1,:]. The strides taken together,
+    mapped positive, and duplicates then satisfy gcd(stride[0], .., stride[j]) =
+    stride[j] for some ordering.
+
+    In this case, for each x[i] at most one candidate exists, given that the
+    algorithm runs with strides sorted from largest to smallest. The problem can
+    be written as::
+
+       sum a_j x_j ?= b = sum a_j z_j
+
+       a_j = n_{j+1} * n_{j+2} * ... * n_d,  a_d = 1
+       0 <= x_j <= u_j <= 2*n_j - 2
+       0 <= z_j <= n_j - 1
+
+    b is the offset of the last element of the second array from the start of
+    the first.  z_j are uniquely determined because of the gcd property. For
+    each x_j, the bounds at first sight allow x_j=z_j and x_j=z_j+n_j. However,
+    u_j <= n_j - 1 + z_j, so that at most one candidate is left.
+
+  - Two arrays with stride-incommensurate starting points. For example,
+    a=x[:,::2], b=x[:,1::2].
+
+    The base address difference is incommensurate with all strides, so that
+    there are no solution candidates to consider. For itemsize != 1, similar
+    result is obtained for x_{n-1}.
+
+  The above cases cover arrays produced by typical slicing of well-behaved
+  parent arrays. More generally, more difficult cases can result::
+
+      x = np.arange(4*20).reshape(4, 20).astype(np.int8)
+      a = x[:,::7]
+      b = x[:,3::3]
+
+      <=>
+
+      20*x1 + 7*x2 + 3*x3 = 78    (= 3 + 3*20 + 5*3)
+      0 <= x1 <= 6, 0 <= x2 <= 2, 0 <= x3 <= 5
+
+  Non-overlapping in this case relies on x.shape[1] <= lcm(7, 3) = 21.  However,
+  elimination of x1 does not restrict candidate values for x3, so the algorithm
+  ends up considering all values x3=0...5 separately.
+
+  The upper bound for work done is prod(shape_a)*prod(shape_b), which scales
+  faster than than work done by binary ufuncs, after broadcasting,
+  prod(shape_a). The bound may be loose, but it is possible to construct hard
+  instances where ufunc is faster (adapted from [2,3])::
+
+      from numpy.lib.stride_tricks import as_strided
+      # Construct non-overlapping x1 and x2
+      x = np.zeros([192163377], dtype=np.int8)
+      x1 = as_strided(x, strides=(36674, 61119, 85569), shape=(1049, 1049, 1049))
+      x2 = as_strided(x[64023025:], strides=(12223, 12224, 1), shape=(1049, 1049, 1))
+
+  To avoid such worst cases, the amount of work done needs to be capped. If the
+  overlap problem is related to ufuncs, one suitable cap choice is to scale
+  max_work with the number of elements of the array. (Ref. [3] describes a more
+  efficient algorithm for solving problems similar to the above --- however,
+  also it must scale exponentially.)
+
+
+  *Integer overflows*
+
+  The algorithm is written in fixed-width integers, and can terminate with
+  failure if integer overflow is detected (the implementation catches all
+  cases). Potential failure modes:
+
+  - Array extent sum(stride*(shape-1)) is too large (for int64).
+
+  - Minimal solutions to a_i x_i + a_j x_j == b are too large,
+    in some of the intermediate equations.
+
+    We do this part of the computation in 128-bit integers.
+
+  In general, overflows are expected only if array size is close to
+  NPY_INT64_MAX, requiring ~exabyte size arrays, which is usually not possible.
+
+  References
+  ----------
+  .. [1] P. Ramachandran, ''Use of Extended Euclidean Algorithm in Solving
+         a System of Linear Diophantine Equations with Bounded Variables''.
+         Algorithmic Number Theory, Lecture Notes in Computer Science **4076**,
+         182-192 (2006). doi:10.1007/11792086_14
+
+  .. [2] Cornuejols, Urbaniak, Weismantel, and Wolsey,
+         ''Decomposition of integer programs and of generating sets.'',
+         Lecture Notes in Computer Science 1284, 92-103 (1997).
+
+  .. [3] K. Aardal, A.K. Lenstra,
+         ''Hard equality constrained integer knapsacks'',
+         Lecture Notes in Computer Science 2337, 350-366 (2002).
+*/  
+
+/*
+  Copyright (c) 2015 Pauli Virtanen
+  All rights reserved.
+  Licensed under 3-clause BSD license, see LICENSE.txt.
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "numpy/ndarraytypes.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
+
+
+#define MAX(a, b) (((a) >= (b)) ? (a) : (b))
+#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
+
+
+/**
+ * Euclid's algorithm for GCD.
+ *
+ * Solves for gamma*a1 + epsilon*a2 == gcd(a1, a2)
+ * providing |gamma| < |a2|/gcd, |epsilon| < |a1|/gcd.
+ */
+static void
+euclid(npy_int64 a1, npy_int64 a2, npy_int64 *a_gcd, npy_int64 *gamma, npy_int64 *epsilon)
+{
+    npy_int64 gamma1, gamma2, epsilon1, epsilon2, r;
+
+    assert(a1 > 0);
+    assert(a2 > 0);
+
+    gamma1 = 1;
+    gamma2 = 0;
+    epsilon1 = 0;
+    epsilon2 = 1;
+
+    /* The numbers remain bounded by |a1|, |a2| during
+       the iteration, so no integer overflows */
+    while (1) {
+        if (a2 > 0) {
+            r = a1/a2;
+            a1 -= r*a2;
+            gamma1 -= r*gamma2;
+            epsilon1 -= r*epsilon2;
+        }
+        else {
+            *a_gcd = a1;
+            *gamma = gamma1;
+            *epsilon = epsilon1;
+            break;
+        }
+
+        if (a1 > 0) {
+            r = a2/a1;
+            a2 -= r*a1;
+            gamma2 -= r*gamma1;
+            epsilon2 -= r*epsilon1;
+        }
+        else {
+            *a_gcd = a2;
+            *gamma = gamma2;
+            *epsilon = epsilon2;
+            break;
+        }
+    }
+}
+
+
+/**
+ * Precompute GCD and bounds transformations
+ */
+static int
+diophantine_precompute(unsigned int n,
+                       diophantine_term_t *E,
+                       diophantine_term_t *Ep,
+                       npy_int64 *Gamma, npy_int64 *Epsilon)
+{
+    npy_int64 a_gcd, gamma, epsilon, c1, c2;
+    unsigned int j;
+    char overflow = 0;
+
+    assert(n >= 2);
+
+    euclid(E[0].a, E[1].a, &a_gcd, &gamma, &epsilon);
+    Ep[0].a = a_gcd;
+    Gamma[0] = gamma;
+    Epsilon[0] = epsilon;
+
+    if (n > 2) {
+        c1 = E[0].a / a_gcd;
+        c2 = E[1].a / a_gcd;
+
+        /* Ep[0].ub = E[0].ub * c1 + E[1].ub * c2; */
+        Ep[0].ub = safe_add(safe_mul(E[0].ub, c1, &overflow),
+                            safe_mul(E[1].ub, c2, &overflow), &overflow);
+        if (overflow) {
+            return 1;
+        }
+    }
+
+    for (j = 2; j < n; ++j) {
+        euclid(Ep[j-2].a, E[j].a, &a_gcd, &gamma, &epsilon);
+        Ep[j-1].a = a_gcd;
+        Gamma[j-1] = gamma;
+        Epsilon[j-1] = epsilon;
+
+        if (j < n - 1) {
+            c1 = Ep[j-2].a / a_gcd;
+            c2 = E[j].a / a_gcd;
+
+            /* Ep[j-1].ub = c1 * Ep[j-2].ub + c2 * E[j].ub; */
+            Ep[j-1].ub = safe_add(safe_mul(c1, Ep[j-2].ub, &overflow),
+                                  safe_mul(c2, E[j].ub, &overflow), &overflow);
+
+            if (overflow) {
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+/**
+ * Depth-first bounded Euclid search
+ */
+static mem_overlap_t
+diophantine_dfs(unsigned int n,
+                unsigned int v,
+                diophantine_term_t *E,
+                diophantine_term_t *Ep,
+                npy_int64 *Gamma, npy_int64 *Epsilon,
+                npy_int64 b,
+                Py_ssize_t max_work,
+                int require_ub_nontrivial,
+                npy_int64 *x,
+                Py_ssize_t *count)
+{
+    npy_int64 a_gcd, gamma, epsilon, a1, u1, a2, u2, c, r, c1, c2, t, t_l, t_u, b2, x1, x2;
+    npy_extint128_t x10, x20, t_l1, t_l2, t_u1, t_u2;
+    mem_overlap_t res;
+    char overflow = 0;
+
+    if (max_work >= 0 && *count >= max_work) {
+        return MEM_OVERLAP_TOO_HARD;
+    }
+
+    /* Fetch precomputed values for the reduced problem */
+    if (v == 1) {
+        a1 = E[0].a;
+        u1 = E[0].ub;
+    }
+    else {
+        a1 = Ep[v-2].a;
+        u1 = Ep[v-2].ub;
+    }
+
+    a2 = E[v].a;
+    u2 = E[v].ub;
+
+    a_gcd = Ep[v-1].a;
+    gamma = Gamma[v-1];
+    epsilon = Epsilon[v-1];
+
+    /* Generate set of allowed solutions */
+    c = b / a_gcd;
+    r = b % a_gcd;
+    if (r != 0) {
+        ++*count;
+        return MEM_OVERLAP_NO;
+    }
+
+    c1 = a2 / a_gcd;
+    c2 = a1 / a_gcd;
+
+    /*
+      The set to enumerate is:
+      x1 = gamma*c + c1*t
+      x2 = epsilon*c - c2*t
+      t integer
+      0 <= x1 <= u1
+      0 <= x2 <= u2
+      and we have c, c1, c2 >= 0
+     */
+
+    x10 = mul_64_64(gamma, c);
+    x20 = mul_64_64(epsilon, c);
+
+    t_l1 = ceildiv_128_64(neg_128(x10), c1);
+    t_l2 = ceildiv_128_64(sub_128(x20, to_128(u2), &overflow), c2);
+
+    t_u1 = floordiv_128_64(sub_128(to_128(u1), x10, &overflow), c1);
+    t_u2 = floordiv_128_64(x20, c2);
+
+    if (overflow) {
+        return MEM_OVERLAP_OVERFLOW;
+    }
+
+    if (gt_128(t_l2, t_l1)) {
+        t_l1 = t_l2;
+    }
+
+    if (gt_128(t_u1, t_u2)) {
+        t_u1 = t_u2;
+    }
+
+    if (gt_128(t_l1, t_u1)) {
+        ++*count;
+        return MEM_OVERLAP_NO;
+    }
+
+    t_l = to_64(t_l1, &overflow);
+    t_u = to_64(t_u1, &overflow);
+
+    x10 = add_128(x10, mul_64_64(c1, t_l), &overflow);
+    x20 = sub_128(x20, mul_64_64(c2, t_l), &overflow);
+
+    t_u = safe_sub(t_u, t_l, &overflow);
+    t_l = 0;
+    x1 = to_64(x10, &overflow);
+    x2 = to_64(x20, &overflow);
+
+    if (overflow) {
+        return MEM_OVERLAP_OVERFLOW;
+    }
+
+    /* The bounds t_l, t_u ensure the x computed below do not overflow */
+
+    if (v == 1) {
+        /* Base case */
+        if (t_u >= t_l) {
+            x[0] = x1 + c1*t_l;
+            x[1] = x2 - c2*t_l;
+            if (require_ub_nontrivial) {
+                int j, is_ub_trivial;
+
+                is_ub_trivial = 1;
+                for (j = 0; j < n; ++j) {
+                    if (x[j] != E[j].ub/2) {
+                        is_ub_trivial = 0;
+                        break;
+                    }
+                }
+
+                if (is_ub_trivial) {
+                    /* Ignore 'trivial' solution */
+                    ++*count;
+                    return MEM_OVERLAP_NO;
+                }
+            }
+            return MEM_OVERLAP_YES;
+        }
+        ++*count;
+        return MEM_OVERLAP_NO;
+    }
+    else {
+        /* Recurse to all candidates */
+        for (t = t_l; t <= t_u; ++t) {
+            x[v] = x2 - c2*t;
+
+            /* b2 = b - a2*x[v]; */
+            b2 = safe_sub(b, safe_mul(a2, x[v], &overflow), &overflow);
+            if (overflow) {
+                return MEM_OVERLAP_OVERFLOW;
+            }
+
+            res = diophantine_dfs(n, v-1, E, Ep, Gamma, Epsilon,
+                                  b2, max_work, require_ub_nontrivial,
+                                  x, count);
+            if (res != MEM_OVERLAP_NO) {
+                return res;
+            }
+        }
+        ++*count;
+        return MEM_OVERLAP_NO;
+    }
+}
+
+
+/**
+ * Solve bounded Diophantine equation
+ *
+ * The problem considered is::
+ *
+ *     A[0] x[0] + A[1] x[1] + ... + A[n-1] x[n-1] == b
+ *     0 <= x[i] <= U[i]
+ *     A[i] > 0
+ *
+ * Solve via depth-first Euclid's algorithm, as explained in [1].
+ *
+ * If require_ub_nontrivial!=0, look for solutions to the problem
+ * where b = A[0]*(U[0]/2) + ... + A[n]*(U[n-1]/2) but ignoring
+ * the trivial solution x[i] = U[i]/2. All U[i] must be divisible by 2.
+ * The value given for `b` is ignored in this case.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_diophantine(unsigned int n, diophantine_term_t *E, npy_int64 b,
+                  Py_ssize_t max_work, int require_ub_nontrivial, npy_int64 *x)
+{
+    unsigned int j;
+
+    for (j = 0; j < n; ++j) {
+        if (E[j].a <= 0) {
+            return MEM_OVERLAP_ERROR;
+        }
+        else if (E[j].ub < 0) {
+            return MEM_OVERLAP_NO;
+        }
+    }
+
+    if (require_ub_nontrivial) {
+        npy_int64 ub_sum = 0;
+        char overflow = 0;
+        for (j = 0; j < n; ++j) {
+            if (E[j].ub % 2 != 0) {
+                return MEM_OVERLAP_ERROR;
+            }
+            ub_sum = safe_add(ub_sum,
+                              safe_mul(E[j].a, E[j].ub/2, &overflow),
+                              &overflow);
+        }
+        if (overflow) {
+            return MEM_OVERLAP_ERROR;
+        }
+        b = ub_sum;
+    }
+
+    if (b < 0) {
+        return MEM_OVERLAP_NO;
+    }
+
+    if (n == 0) {
+        if (require_ub_nontrivial) {
+            /* Only trivial solution for 0-variable problem */
+            return MEM_OVERLAP_NO;
+        }
+        if (b == 0) {
+            return MEM_OVERLAP_YES;
+        }
+        return MEM_OVERLAP_NO;
+    }
+    else if (n == 1) {
+        if (require_ub_nontrivial) {
+            /* Only trivial solution for 1-variable problem */
+            return MEM_OVERLAP_NO;
+        }
+        if (b % E[0].a == 0) {
+            x[0] = b / E[0].a;
+            if (x[0] >= 0 && x[0] <= E[0].ub) {
+                return MEM_OVERLAP_YES;
+            }
+        }
+        return MEM_OVERLAP_NO;
+    }
+    else {
+        diophantine_term_t Ep[n];
+        npy_int64 Epsilon[n], Gamma[n];
+        Py_ssize_t count = 0;
+
+        if (diophantine_precompute(n, E, Ep, Gamma, Epsilon)) {
+            return MEM_OVERLAP_OVERFLOW;
+        }
+        return diophantine_dfs(n, n-1, E, Ep, Gamma, Epsilon, b, max_work,
+                               require_ub_nontrivial, x, &count);
+    }
+}
+
+
+static int
+diophantine_sort_A(const void *xp, const void *yp)
+{
+    npy_int64 xa = ((diophantine_term_t*)xp)->a;
+    npy_int64 ya = ((diophantine_term_t*)yp)->a;
+
+    if (xa < ya) {
+        return 1;
+    }
+    else if (ya < xa) {
+        return -1;
+    }
+    else {
+        return 0;
+    }
+}
+
+
+/**
+ * Simplify Diophantine decision problem.
+ *
+ * Combine identical coefficients, remove unnecessary variables, and trim
+ * bounds.
+ *
+ * The feasible/infeasible decision result is retained.
+ *
+ * Returns: 0 (success), -1 (integer overflow).
+ */
+NPY_VISIBILITY_HIDDEN int
+diophantine_simplify(unsigned int *n, diophantine_term_t *E, npy_int64 b)
+{
+    unsigned int i, j, m;
+    char overflow = 0;
+
+    /* Skip obviously infeasible cases */
+    for (j = 0; j < *n; ++j) {
+        if (E[j].ub < 0) {
+            return 0;
+        }
+    }
+
+    if (b < 0) {
+        return 0;
+    }
+
+    /* Sort vs. coefficients */
+    qsort(E, *n, sizeof(diophantine_term_t), diophantine_sort_A);
+
+    /* Combine identical coefficients */
+    m = *n;
+    i = 0;
+    for (j = 1; j < m; ++j) {
+        if (E[i].a == E[j].a) {
+            E[i].ub = safe_add(E[i].ub, E[j].ub, &overflow);
+            --*n;
+        }
+        else {
+            ++i;
+            if (i != j) {
+                E[i] = E[j];
+            }
+        }
+    }
+
+    /* Trim bounds and remove unnecessary variables */
+    m = *n;
+    i = 0;
+    for (j = 0; j < m; ++j) {
+        E[j].ub = MIN(E[j].ub, b / E[j].a);
+        if (E[j].ub == 0) {
+            /* If the problem is feasible at all, x[i]=0 */
+            --*n;
+        }
+        else {
+            if (i != j) {
+                E[i] = E[j];
+            }
+            ++i;
+        }
+    }
+
+    if (overflow) {
+        return -1;
+    }
+    else {
+        return 0;
+    }
+}
+
+
+/* Gets a half-open range [start, end) of offsets from the data pointer */
+NPY_VISIBILITY_HIDDEN void
+offset_bounds_from_strides(const int itemsize, const int nd,
+                           const npy_intp *dims, const npy_intp *strides,
+                           npy_intp *lower_offset, npy_intp *upper_offset)
+{
+    npy_intp max_axis_offset;
+    npy_intp lower = 0;
+    npy_intp upper = 0;
+    int i;
+
+    for (i = 0; i < nd; i++) {
+        if (dims[i] == 0) {
+            /* If the array size is zero, return an empty range */
+            *lower_offset = 0;
+            *upper_offset = 0;
+            return;
+        }
+        /* Expand either upwards or downwards depending on stride */
+        max_axis_offset = strides[i] * (dims[i] - 1);
+        if (max_axis_offset > 0) {
+            upper += max_axis_offset;
+        }
+        else {
+            lower += max_axis_offset;
+        }
+    }
+    /* Return a half-open range */
+    upper += itemsize;
+    *lower_offset = lower;
+    *upper_offset = upper;
+}
+
+
+/* Gets a half-open range [start, end) which contains the array data */
+static void
+get_array_memory_extents(PyArrayObject *arr,
+                         npy_uintp *out_start, npy_uintp *out_end,
+                         npy_uintp *num_bytes)
+{
+    npy_intp low, upper;
+    int j;
+    offset_bounds_from_strides(PyArray_ITEMSIZE(arr), PyArray_NDIM(arr),
+                               PyArray_DIMS(arr), PyArray_STRIDES(arr),
+                               &low, &upper);
+    *out_start = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)low;
+    *out_end = (npy_uintp)PyArray_DATA(arr) + (npy_uintp)upper;
+
+    *num_bytes = PyArray_ITEMSIZE(arr);
+    for (j = 0; j < PyArray_NDIM(arr); ++j) {
+        *num_bytes *= PyArray_DIM(arr, j);
+    }
+}
+
+
+static int
+strides_to_terms(PyArrayObject *arr, diophantine_term_t *terms,
+                 unsigned int *nterms, int skip_empty)
+{
+    unsigned int i;
+
+    for (i = 0; i < PyArray_NDIM(arr); ++i) {
+        if (skip_empty) {
+            if (PyArray_DIM(arr, i) <= 1 || PyArray_STRIDE(arr, i) == 0) {
+                continue;
+            }
+        }
+
+        terms[*nterms].a = PyArray_STRIDE(arr, i);
+
+        if (terms[*nterms].a < 0) {
+            terms[*nterms].a = -terms[*nterms].a;
+        }
+
+        if (terms[*nterms].a < 0) {
+            /* integer overflow */
+            return 1;
+        }
+
+        terms[*nterms].ub = PyArray_DIM(arr, i) - 1;
+        ++*nterms;
+    }
+
+    return 0;
+}
+
+
+/**
+ * Determine whether two arrays share some memory.
+ *
+ * Returns: 0 (no shared memory), 1 (shared memory), or < 0 (failed to solve).
+ *
+ * Note that failures to solve can occur due to integer overflows, or effort
+ * required solving the problem exceeding max_work.  The general problem is
+ * NP-hard and worst case runtime is exponential in the number of dimensions.
+ * max_work controls the amount of work done, either exact (max_work == -1), only
+ * a simple memory extent check (max_work == 0), or set an upper bound
+ * max_work > 0 for the number of solution candidates considered.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_share_memory(PyArrayObject *a, PyArrayObject *b,
+                       Py_ssize_t max_work)
+{
+    npy_int64 rhs;
+    diophantine_term_t terms[2*NPY_MAXDIMS+2];
+    npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0, size1 = 0, size2 = 0;
+    npy_int64 x[2*NPY_MAXDIMS+2];
+    unsigned int nterms;
+
+    get_array_memory_extents(a, &start1, &end1, &size1);
+    get_array_memory_extents(b, &start2, &end2, &size2);
+
+    if (!(start1 < end2 && start2 < end1 && start1 < end1 && start2 < end2)) {
+        /* Memory extents don't overlap */
+        return MEM_OVERLAP_NO;
+    }
+
+    if (max_work == 0) {
+        /* Too much work required, give up */
+        return MEM_OVERLAP_TOO_HARD;
+    }
+
+    /* Convert problem to Diophantine equation form with positive coefficients.
+       The bounds computed by offset_bounds_from_strides correspond to
+       all-positive strides.
+
+       start1 + sum(abs(stride1)*x1)
+       == start2 + sum(abs(stride2)*x2)
+       == end1 - 1 - sum(abs(stride1)*x1')
+       == end2 - 1 - sum(abs(stride2)*x2')
+
+       <=>
+
+       sum(abs(stride1)*x1) + sum(abs(stride2)*x2')
+       == end2 - 1 - start1
+
+       OR
+
+       sum(abs(stride1)*x1') + sum(abs(stride2)*x2)
+       == end1 - 1 - start2
+
+       We pick the problem with the smaller RHS (they are non-negative due to
+       the extent check above.)
+    */
+
+    rhs = MIN(end2 - 1 - start1, end1 - 1 - start2);
+
+    if (rhs != (npy_uintp)rhs) {
+        /* Integer overflow */
+        return MEM_OVERLAP_OVERFLOW;
+    }
+
+    nterms = 0;
+    if (strides_to_terms(a, terms, &nterms, 1)) {
+        return MEM_OVERLAP_OVERFLOW;
+    }
+    if (strides_to_terms(b, terms, &nterms, 1)) {
+        return MEM_OVERLAP_OVERFLOW;
+    }
+    if (PyArray_ITEMSIZE(a) > 1) {
+        terms[nterms].a = 1;
+        terms[nterms].ub = PyArray_ITEMSIZE(a) - 1;
+        ++nterms;
+    }
+    if (PyArray_ITEMSIZE(b) > 1) {
+        terms[nterms].a = 1;
+        terms[nterms].ub = PyArray_ITEMSIZE(b) - 1;
+        ++nterms;
+    }
+
+    /* Simplify, if possible */
+    if (diophantine_simplify(&nterms, terms, rhs)) {
+        /* Integer overflow */
+        return MEM_OVERLAP_OVERFLOW;
+    }
+
+    /* Solve */
+    return solve_diophantine(nterms, terms, rhs, max_work, 0, x);
+}
+
+
+/**
+ * Determine whether an array has internal overlap.
+ *
+ * Returns: 0 (no overlap), 1 (overlap), or < 0 (failed to solve).
+ *
+ * max_work and reasons for solver failures are as in solve_may_share_memory.
+ */
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_have_internal_overlap(PyArrayObject *a, Py_ssize_t max_work)
+{
+    diophantine_term_t terms[NPY_MAXDIMS+1];
+    npy_int64 x[NPY_MAXDIMS+1];
+    unsigned int nterms;
+    int i, j;
+
+    if (PyArray_ISCONTIGUOUS(a)) {
+        /* Quick case */
+        return MEM_OVERLAP_NO;
+    }
+
+    /* The internal memory overlap problem is looking for two different
+       solutions to
+
+           sum(a*x) = b,   0 <= x[i] <= ub[i]
+
+       for any b. Equivalently,
+
+           sum(a*x0) - sum(a*x1) = 0
+
+       Mapping the coefficients on the left by x0'[i] = x0[i] if a[i] > 0
+       else ub[i]-x0[i] and opposite for x1, we have
+
+           sum(abs(a)*(x0' + x1')) = sum(abs(a)*ub)
+
+       Now, x0!=x1 if for some i we have x0'[i] + x1'[i] != ub[i].
+       We can now change variables to z[i] = x0'[i] + x1'[i] so the problem
+       becomes
+
+           sum(abs(a)*z) = sum(abs(a)*ub),   0 <= z[i] <= 2*ub[i],   z != ub
+
+       This can be solved with solve_diophantine.
+    */
+
+    nterms = 0;
+    if (strides_to_terms(a, terms, &nterms, 0)) {
+        return MEM_OVERLAP_OVERFLOW;
+    }
+    if (PyArray_ITEMSIZE(a) > 1) {
+        terms[nterms].a = 1;
+        terms[nterms].ub = PyArray_ITEMSIZE(a) - 1;
+        ++nterms;
+    }
+
+    /* Get rid of zero coefficients and empty terms */
+    i = 0;
+    for (j = 0; j < nterms; ++j) {
+        if (terms[j].ub == 0) {
+            continue;
+        }
+        else if (terms[j].ub < 0) {
+            return MEM_OVERLAP_NO;
+        }
+        else if (terms[j].a == 0) {
+            return MEM_OVERLAP_YES;
+        }
+        if (i != j) {
+            terms[i] = terms[j];
+        }
+        ++i;
+    }
+    nterms = i;
+
+    /* Double bounds to get the internal overlap problem */
+    for (j = 0; j < nterms; ++j) {
+        terms[j].ub *= 2;
+    }
+
+    /* Sort vs. coefficients; cannot call diophantine_simplify because it may
+       change the decision problem inequality part */
+    qsort(terms, nterms, sizeof(diophantine_term_t), diophantine_sort_A);
+
+    /* Solve */
+    return solve_diophantine(nterms, terms, -1, max_work, 1, x);
+}
diff --git a/numpy/core/src/private/mem_overlap.h b/numpy/core/src/private/mem_overlap.h
new file mode 100644
index 000000000..8044f1663
--- /dev/null
+++ b/numpy/core/src/private/mem_overlap.h
@@ -0,0 +1,50 @@
+#ifndef MEM_OVERLAP_H_
+#define MEM_OVERLAP_H_
+
+#include "npy_config.h"
+#include "numpy/ndarraytypes.h"
+
+
+/* Bounds check only */
+#define NPY_MAY_SHARE_BOUNDS 0
+
+/* Exact solution */
+#define NPY_MAY_SHARE_EXACT -1
+
+
+typedef enum {
+    MEM_OVERLAP_NO = 0,        /* no solution exists */
+    MEM_OVERLAP_YES = 1,       /* solution found */
+    MEM_OVERLAP_TOO_HARD = -1, /* max_work exceeded */
+    MEM_OVERLAP_OVERFLOW = -2, /* algorithm failed due to integer overflow */
+    MEM_OVERLAP_ERROR = -3     /* invalid input */
+} mem_overlap_t;
+
+
+typedef struct {
+    npy_int64 a;
+    npy_int64 ub;
+} diophantine_term_t;
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_diophantine(unsigned int n, diophantine_term_t *E,
+                  npy_int64 b, Py_ssize_t max_work, int require_nontrivial,
+                  npy_int64 *x);
+
+NPY_VISIBILITY_HIDDEN int
+diophantine_simplify(unsigned int *n, diophantine_term_t *E, npy_int64 b);
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_share_memory(PyArrayObject *a, PyArrayObject *b,
+                       Py_ssize_t max_work);
+
+NPY_VISIBILITY_HIDDEN mem_overlap_t
+solve_may_have_internal_overlap(PyArrayObject *a, Py_ssize_t max_work);
+
+NPY_VISIBILITY_HIDDEN void
+offset_bounds_from_strides(const int itemsize, const int nd,
+                           const npy_intp *dims, const npy_intp *strides,
+                           npy_intp *lower_offset, npy_intp *upper_offset);
+
+#endif
+
diff --git a/numpy/core/src/private/npy_extint128.h b/numpy/core/src/private/npy_extint128.h
new file mode 100644
index 000000000..6a35e736f
--- /dev/null
+++ b/numpy/core/src/private/npy_extint128.h
@@ -0,0 +1,317 @@
+#ifndef NPY_EXTINT128_H_
+#define NPY_EXTINT128_H_
+
+
+typedef struct {
+    char sign;
+    npy_uint64 lo, hi;
+} npy_extint128_t;
+
+
+/* Integer addition with overflow checking */
+static NPY_INLINE npy_int64
+safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+    if (a > 0 && b > NPY_MAX_INT64 - a) {
+        *overflow_flag = 1;
+    }
+    else if (a < 0 && b < NPY_MIN_INT64 - a) {
+        *overflow_flag = 1;
+    }
+    return a + b;
+}
+
+
+/* Integer subtraction with overflow checking */
+static NPY_INLINE npy_int64
+safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+    if (a >= 0 && b < a - NPY_MAX_INT64) {
+        *overflow_flag = 1;
+    }
+    else if (a < 0 && b > a - NPY_MIN_INT64) {
+        *overflow_flag = 1;
+    }
+    return a - b;
+}
+
+
+/* Integer multiplication with overflow checking */
+static NPY_INLINE npy_int64
+safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
+{
+    if (a > 0) {
+        if (b > NPY_MAX_INT64 / a || b < NPY_MIN_INT64 / a) {
+            *overflow_flag = 1;
+        }
+    }
+    else if (a < 0) {
+        if (b > 0 && a < NPY_MIN_INT64 / b) {
+            *overflow_flag = 1;
+        }
+        else if (b < 0 && a < NPY_MAX_INT64 / b) {
+            *overflow_flag = 1;
+        }
+    }
+    return a * b;
+}
+
+
+/* Long integer init */
+static NPY_INLINE npy_extint128_t
+to_128(npy_int64 x)
+{
+    npy_extint128_t result;
+    result.sign = (x >= 0 ? 1 : -1);
+    if (x >= 0) {
+        result.lo = x;
+    }
+    else {
+        result.lo = (npy_uint64)(-(x + 1)) + 1;
+    }
+    result.hi = 0;
+    return result;
+}
+
+
+static NPY_INLINE npy_int64
+to_64(npy_extint128_t x, char *overflow)
+{
+    if (x.hi != 0 ||
+        (x.sign > 0 && x.lo > NPY_MAX_INT64) ||
+        (x.sign < 0 && x.lo != 0 && x.lo - 1 > -(NPY_MIN_INT64 + 1))) {
+        *overflow = 1;
+    }
+    return x.lo * x.sign;
+}
+
+
+/* Long integer multiply */
+static NPY_INLINE npy_extint128_t
+mul_64_64(npy_int64 a, npy_int64 b)
+{
+    npy_extint128_t x, y, z;
+    npy_uint64 x1, x2, y1, y2, r1, r2, prev;
+
+    x = to_128(a);
+    y = to_128(b);
+
+    x1 = x.lo & 0xffffffff;
+    x2 = x.lo >> 32;
+
+    y1 = y.lo & 0xffffffff;
+    y2 = y.lo >> 32;
+
+    r1 = x1*y2;
+    r2 = x2*y1;
+
+    z.sign = x.sign * y.sign;
+    z.hi = x2*y2 + (r1 >> 32) + (r2 >> 32);
+    z.lo = x1*y1;
+
+    /* Add with carry */
+    prev = z.lo;
+    z.lo += (r1 << 32);
+    if (z.lo < prev) {
+        ++z.hi;
+    }
+
+    prev = z.lo;
+    z.lo += (r2 << 32);
+    if (z.lo < prev) {
+        ++z.hi;
+    }
+
+    return z;
+}
+
+
+/* Long integer add */
+static NPY_INLINE npy_extint128_t
+add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
+{
+    npy_extint128_t z;
+
+    if (x.sign == y.sign) {
+        z.sign = x.sign;
+        z.hi = x.hi + y.hi;
+        if (z.hi < x.hi) {
+            *overflow = 1;
+        }
+        z.lo = x.lo + y.lo;
+        if (z.lo < x.lo) {
+            if (z.hi == NPY_MAX_UINT64) {
+                *overflow = 1;
+            }
+            ++z.hi;
+        }
+    }
+    else if (x.hi > y.hi || (x.hi == y.hi && x.lo >= y.lo)) {
+        z.sign = x.sign;
+        z.hi = x.hi - y.hi;
+        z.lo = x.lo;
+        z.lo -= y.lo;
+        if (z.lo > x.lo) {
+            --z.hi;
+        }
+    }
+    else {
+        z.sign = y.sign;
+        z.hi = y.hi - x.hi;
+        z.lo = y.lo;
+        z.lo -= x.lo;
+        if (z.lo > y.lo) {
+            --z.hi;
+        }
+    }
+
+    return z;
+}
+
+
+/* Long integer negation */
+static NPY_INLINE npy_extint128_t
+neg_128(npy_extint128_t x)
+{
+    npy_extint128_t z = x;
+    z.sign *= -1;
+    return z;
+}
+
+
+static NPY_INLINE npy_extint128_t
+sub_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
+{
+    return add_128(x, neg_128(y), overflow);
+}
+
+
+static NPY_INLINE npy_extint128_t
+shl_128(npy_extint128_t v)
+{
+    npy_extint128_t z;
+    z = v;
+    z.hi <<= 1;
+    z.hi |= (z.lo & (((npy_uint64)1) << 63)) >> 63;
+    z.lo <<= 1;
+    return z;
+}
+
+
+static NPY_INLINE npy_extint128_t
+shr_128(npy_extint128_t v)
+{
+    npy_extint128_t z;
+    z = v;
+    z.lo >>= 1;
+    z.lo |= (z.hi & 0x1) << 63;
+    z.hi >>= 1;
+    return z;
+}
+
+static NPY_INLINE int
+gt_128(npy_extint128_t a, npy_extint128_t b)
+{
+    if (a.sign > 0 && b.sign > 0) {
+        return (a.hi > b.hi) || (a.hi == b.hi && a.lo > b.lo);
+    }
+    else if (a.sign < 0 && b.sign < 0) {
+        return (a.hi < b.hi) || (a.hi == b.hi && a.lo < b.lo);
+    }
+    else if (a.sign > 0 && b.sign < 0) {
+        return a.hi != 0 || a.lo != 0 || b.hi != 0 || b.lo != 0;
+    }
+    else {
+        return 0;
+    }
+}
+
+
+/* Long integer divide */
+static NPY_INLINE npy_extint128_t
+divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
+{
+    npy_extint128_t remainder, pointer, result, divisor;
+    char overflow = 0;
+
+    assert(b > 0);
+
+    if (b <= 1 || x.hi == 0) {
+        result.sign = x.sign;
+        result.lo = x.lo / b;
+        result.hi = x.hi / b;
+        *mod = x.sign * (x.lo % b);
+        return result;
+    }
+
+    /* Long division, not the most efficient choice */
+    remainder = x;
+    remainder.sign = 1;
+
+    divisor.sign = 1;
+    divisor.hi = 0;
+    divisor.lo = b;
+
+    result.sign = 1;
+    result.lo = 0;
+    result.hi = 0;
+
+    pointer.sign = 1;
+    pointer.lo = 1;
+    pointer.hi = 0;
+
+    while ((divisor.hi & (((npy_uint64)1) << 63)) == 0 &&
+           gt_128(remainder, divisor)) {
+        divisor = shl_128(divisor);
+        pointer = shl_128(pointer);
+    }
+
+    while (pointer.lo || pointer.hi) {
+        if (!gt_128(divisor, remainder)) {
+            remainder = sub_128(remainder, divisor, &overflow);
+            result = add_128(result, pointer, &overflow);
+        }
+        divisor = shr_128(divisor);
+        pointer = shr_128(pointer);
+    }
+
+    /* Fix signs and return; cannot overflow */
+    result.sign = x.sign;
+    *mod = x.sign * remainder.lo;
+
+    return result;
+}
+
+
+/* Divide and round down (positive divisor; no overflows) */
+static NPY_INLINE npy_extint128_t
+floordiv_128_64(npy_extint128_t a, npy_int64 b)
+{
+    npy_extint128_t result;
+    npy_int64 remainder;
+    char overflow = 0;
+    assert(b > 0);
+    result = divmod_128_64(a, b, &remainder);
+    if (a.sign < 0 && remainder != 0) {
+        result = sub_128(result, to_128(1), &overflow);
+    }
+    return result;
+}
+
+
+/* Divide and round up (positive divisor; no overflows) */
+static NPY_INLINE npy_extint128_t
+ceildiv_128_64(npy_extint128_t a, npy_int64 b)
+{
+    npy_extint128_t result;
+    npy_int64 remainder;
+    char overflow = 0;
+    assert(b > 0);
+    result = divmod_128_64(a, b, &remainder);
+    if (a.sign > 0 && remainder != 0) {
+        result = add_128(result, to_128(1), &overflow);
+    }
+    return result;
+}
+
+#endif
diff --git a/numpy/core/tests/test_extint128.py b/numpy/core/tests/test_extint128.py
new file mode 100644
index 000000000..2afae2f6b
--- /dev/null
+++ b/numpy/core/tests/test_extint128.py
@@ -0,0 +1,225 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+import contextlib
+import operator
+
+import numpy as np
+import numpy.core.multiarray_tests as mt
+from numpy.compat import long
+
+from numpy.testing import assert_raises, assert_equal
+
+
+INT64_MAX = np.iinfo(np.int64).max
+INT64_MIN = np.iinfo(np.int64).min
+INT64_MID = 2**32
+
+# int128 is not two's complement, the sign bit is separate
+INT128_MAX = 2**128 - 1
+INT128_MIN = -INT128_MAX
+INT128_MID = 2**64
+
+INT64_VALUES = (
+    [INT64_MIN + j for j in range(20)] +
+    [INT64_MAX - j for j in range(20)] +
+    [INT64_MID + j for j in range(-20, 20)] +
+    [2*INT64_MID + j for j in range(-20, 20)] +
+    [INT64_MID//2 + j for j in range(-20, 20)] +
+    list(range(-70, 70))
+)
+
+INT128_VALUES = (
+    [INT128_MIN + j for j in range(20)] +
+    [INT128_MAX - j for j in range(20)] +
+    [INT128_MID + j for j in range(-20, 20)] +
+    [2*INT128_MID + j for j in range(-20, 20)] +
+    [INT128_MID//2 + j for j in range(-20, 20)] +
+    list(range(-70, 70)) +
+    [False]  # negative zero
+)
+
+INT64_POS_VALUES = [x for x in INT64_VALUES if x > 0]
+
+
+@contextlib.contextmanager
+def exc_iter(*args):
+    """
+    Iterate over Cartesian product of *args, and if an exception is raised,
+    add information of the current iterate.
+    """
+
+    value = [None]
+
+    def iterate():
+        for v in itertools.product(*args):
+            value[0] = v
+            yield v
+
+    try:
+        yield iterate()
+    except:
+        import traceback
+        msg = "At: %r\n%s" % (repr(value[0]),
+                              traceback.format_exc())
+        raise AssertionError(msg)
+
+
+def test_safe_binop():
+    # Test checked arithmetic routines
+
+    ops = [
+        (operator.add, 1),
+        (operator.sub, 2),
+        (operator.mul, 3)
+    ]
+
+    with exc_iter(ops, INT64_VALUES, INT64_VALUES) as it:
+        for xop, a, b in it:
+            pyop, op = xop
+            c = pyop(a, b)
+
+            if not (INT64_MIN <= c <= INT64_MAX):
+                assert_raises(OverflowError, mt.extint_safe_binop, a, b, op)
+            else:
+                d = mt.extint_safe_binop(a, b, op)
+                if c != d:
+                    # assert_equal is slow
+                    assert_equal(d, c)
+
+
+def test_to_128():
+    with exc_iter(INT64_VALUES) as it:
+        for a, in it:
+            b = mt.extint_to_128(a)
+            if a != b:
+                assert_equal(b, a)
+
+
+def test_to_64():
+    with exc_iter(INT128_VALUES) as it:
+        for a, in it:
+            if not (INT64_MIN <= a <= INT64_MAX):
+                assert_raises(OverflowError, mt.extint_to_64, a)
+            else:
+                b = mt.extint_to_64(a)
+                if a != b:
+                    assert_equal(b, a)
+
+
+def test_mul_64_64():
+    with exc_iter(INT64_VALUES, INT64_VALUES) as it:
+        for a, b in it:
+            c = a * b
+            d = mt.extint_mul_64_64(a, b)
+            if c != d:
+                assert_equal(d, c)
+
+
+def test_add_128():
+    with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+        for a, b in it:
+            c = a + b
+            if not (INT128_MIN <= c <= INT128_MAX):
+                assert_raises(OverflowError, mt.extint_add_128, a, b)
+            else:
+                d = mt.extint_add_128(a, b)
+                if c != d:
+                    assert_equal(d, c)
+
+
+def test_sub_128():
+    with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+        for a, b in it:
+            c = a - b
+            if not (INT128_MIN <= c <= INT128_MAX):
+                assert_raises(OverflowError, mt.extint_sub_128, a, b)
+            else:
+                d = mt.extint_sub_128(a, b)
+                if c != d:
+                    assert_equal(d, c)
+
+
+def test_neg_128():
+    with exc_iter(INT128_VALUES) as it:
+        for a, in it:
+            b = -a
+            c = mt.extint_neg_128(a)
+            if b != c:
+                assert_equal(c, b)
+
+
+def test_shl_128():
+    with exc_iter(INT128_VALUES) as it:
+        for a, in it:
+            if a < 0:
+                b = -(((-a) << 1) & (2**128-1))
+            else:
+                b = (a << 1) & (2**128-1)
+            c = mt.extint_shl_128(a)
+            if b != c:
+                assert_equal(c, b)
+
+
+def test_shr_128():
+    with exc_iter(INT128_VALUES) as it:
+        for a, in it:
+            if a < 0:
+                b = -((-a) >> 1)
+            else:
+                b = a >> 1
+            c = mt.extint_shr_128(a)
+            if b != c:
+                assert_equal(c, b)
+
+
+def test_gt_128():
+    with exc_iter(INT128_VALUES, INT128_VALUES) as it:
+        for a, b in it:
+            c = a > b
+            d = mt.extint_gt_128(a, b)
+            if c != d:
+                assert_equal(d, c)
+
+
+def test_divmod_128_64():
+    with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+        for a, b in it:
+            if a >= 0:
+                c, cr = divmod(a, b)
+            else:
+                c, cr = divmod(-a, b)
+                c = -c
+                cr = -cr
+
+            d, dr = mt.extint_divmod_128_64(a, b)
+
+            if c != d or d != dr or b*d + dr != a:
+                assert_equal(d, c)
+                assert_equal(dr, cr)
+                assert_equal(b*d + dr, a)
+
+
+def test_floordiv_128_64():
+    with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+        for a, b in it:
+            c = a // b
+            d = mt.extint_floordiv_128_64(a, b)
+
+            if c != d:
+                assert_equal(d, c)
+
+
+def test_ceildiv_128_64():
+    with exc_iter(INT128_VALUES, INT64_POS_VALUES) as it:
+        for a, b in it:
+            c = (a + b - 1) // b
+            d = mt.extint_ceildiv_128_64(a, b)
+
+            if c != d:
+                assert_equal(d, c)
+
+
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_mem_overlap.py b/numpy/core/tests/test_mem_overlap.py
new file mode 100644
index 000000000..728cc675d
--- /dev/null
+++ b/numpy/core/tests/test_mem_overlap.py
@@ -0,0 +1,486 @@
+from __future__ import division, absolute_import, print_function
+
+import sys
+import itertools
+
+import numpy as np
+from numpy.testing import run_module_suite, assert_, assert_raises, assert_equal
+
+from numpy.core.multiarray_tests import solve_diophantine, internal_overlap
+from numpy.lib.stride_tricks import as_strided
+from numpy.compat import long
+
+if sys.version_info[0] >= 3:
+    xrange = range
+
+
+ndims = 2
+size = 10
+shape = tuple([size] * ndims)
+
+MAY_SHARE_BOUNDS = 0
+MAY_SHARE_EXACT = -1
+
+
+def _indices_for_nelems(nelems):
+    """Returns slices of length nelems, from start onwards, in direction sign."""
+
+    if nelems == 0:
+        return [size // 2]  # int index
+
+    res = []
+    for step in (1, 2):
+        for sign in (-1, 1):
+            start = size // 2 - nelems * step * sign // 2
+            stop = start + nelems * step * sign
+            res.append(slice(start, stop, step * sign))
+
+    return res
+
+
+def _indices_for_axis():
+    """Returns (src, dst) pairs of indices."""
+
+    res = []
+    for nelems in (0, 2, 3):
+        ind = _indices_for_nelems(nelems)
+
+        # no itertools.product available in Py2.4
+        res.extend([(a, b) for a in ind for b in ind])  # all assignments of size "nelems"
+
+    return res
+
+
+def _indices(ndims):
+    """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
+
+    ind = _indices_for_axis()
+
+    # no itertools.product available in Py2.4
+
+    res = [[]]
+    for i in range(ndims):
+        newres = []
+        for elem in ind:
+            for others in res:
+                newres.append([elem] + others)
+        res = newres
+
+    return res
+
+
+def _check_assignment(srcidx, dstidx):
+    """Check assignment arr[dstidx] = arr[srcidx] works."""
+
+    arr = np.arange(np.product(shape)).reshape(shape)
+
+    cpy = arr.copy()
+
+    cpy[dstidx] = arr[srcidx]
+    arr[dstidx] = arr[srcidx]
+
+    assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx)
+
+
+def test_overlapping_assignments():
+    """Test automatically generated assignments which overlap in memory."""
+
+    inds = _indices(ndims)
+
+    for ind in inds:
+        srcidx = tuple([a[0] for a in ind])
+        dstidx = tuple([a[1] for a in ind])
+
+        yield _check_assignment, srcidx, dstidx
+
+
+def test_diophantine_fuzz():
+    # Fuzz test the diophantine solver
+    rng = np.random.RandomState(1234)
+
+    max_int = np.iinfo(np.intp).max
+
+    for ndim in range(10):
+        feasible_count = 0
+        infeasible_count = 0
+
+        min_count = 500//(ndim + 1)
+
+        numbers = []
+        while min(feasible_count, infeasible_count) < min_count:
+            # Ensure big and small integer problems
+            A_max = 1 + rng.randint(0, 11)**6
+            U_max = rng.randint(0, 11)**6
+
+            A_max = min(max_int, A_max)
+            U_max = min(max_int-1, U_max)
+
+            A = tuple(rng.randint(1, A_max+1) for j in range(ndim))
+            U = tuple(rng.randint(0, U_max+2) for j in range(ndim))
+
+            b_ub = min(max_int-2, sum(a*ub for a, ub in zip(A, U)))
+            b = rng.randint(-1, b_ub+2)
+
+            if ndim == 0 and feasible_count < min_count:
+                b = 0
+
+            X = solve_diophantine(A, U, b)
+
+            if X is None:
+                # Check the simplified decision problem agrees
+                X_simplified = solve_diophantine(A, U, b, simplify=1)
+                assert X_simplified is None, (A, U, b, X_simplified)
+
+                # Check no solution exists (provided the problem is
+                # small enough so that brute force checking doesn't
+                # take too long)
+                try:
+                    ranges = tuple(xrange(0, a*ub+1, a) for a, ub in zip(A, U))
+                except OverflowError:
+                    # xrange on 32-bit Python 2 may overflow
+                    continue
+
+                size = 1
+                for r in ranges:
+                    size *= len(r)
+                if size < 100000:
+                    assert_(not any(sum(w) == b for w in itertools.product(*ranges)))
+                    infeasible_count += 1
+            else:
+                # Check the simplified decision problem agrees
+                X_simplified = solve_diophantine(A, U, b, simplify=1)
+                assert X_simplified is not None, (A, U, b, X_simplified)
+
+                # Check validity
+                assert_(sum(a*x for a, x in zip(A, X)) == b)
+                assert_(all(0 <= x <= ub for x, ub in zip(X, U)))
+                feasible_count += 1
+
+
+def test_diophantine_overflow():
+    # Smoke test integer overflow detection
+    max_intp = np.iinfo(np.intp).max
+    max_int64 = np.iinfo(np.int64).max
+
+    if max_int64 <= max_intp:
+        # Check that the algorithm works internally in 128-bit;
+        # solving this problem requires large intermediate numbers
+        A = (max_int64//2, max_int64//2 - 10)
+        U = (max_int64//2, max_int64//2 - 10)
+        b = 2*(max_int64//2) - 10
+
+        assert_equal(solve_diophantine(A, U, b), (1, 1))
+
+
+def check_may_share_memory_exact(a, b):
+    got = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+    assert_equal(np.may_share_memory(a, b),
+                 np.may_share_memory(a, b, max_work=MAY_SHARE_BOUNDS))
+
+    a.fill(0)
+    b.fill(0)
+    a.fill(1)
+    exact = b.any()
+
+    err_msg = ""
+    if got != exact:
+        err_msg = "    " + "\n    ".join([
+            "base_a - base_b = %r" % (a.__array_interface__['data'][0] - b.__array_interface__['data'][0],),
+            "shape_a = %r" % (a.shape,),
+            "shape_b = %r" % (b.shape,),
+            "strides_a = %r" % (a.strides,),
+            "strides_b = %r" % (b.strides,),
+            "size_a = %r" % (a.size,),
+            "size_b = %r" % (b.size,)
+        ])
+
+    assert_equal(got, exact, err_msg=err_msg)
+
+
+def test_may_share_memory_manual():
+    # Manual test cases for may_share_memory
+
+    # Base arrays
+    xs0 = [
+        np.zeros([13, 21, 23, 22], dtype=np.int8),
+        np.zeros([13, 21, 23*2, 22], dtype=np.int8)[:,:,::2,:]
+    ]
+
+    # Generate all negative stride combinations
+    xs = []
+    for x in xs0:
+        for ss in itertools.product(*(([slice(None), slice(None, None, -1)],)*4)):
+            xp = x[ss]
+            xs.append(xp)
+
+    for x in xs:
+        # The default is a simple extent check
+        assert_(np.may_share_memory(x[:,0,:], x[:,1,:]))
+        assert_(np.may_share_memory(x[:,0,:], x[:,1,:], max_work=None))
+
+        # Exact checks
+        check_may_share_memory_exact(x[:,0,:], x[:,1,:])
+        check_may_share_memory_exact(x[:,::7], x[:,3::3])
+
+        try:
+            xp = x.ravel()
+            if xp.flags.owndata:
+                continue
+            xp = xp.view(np.int16)
+        except ValueError:
+            continue
+
+        # 0-size arrays cannot overlap
+        check_may_share_memory_exact(x.ravel()[6:6],
+                                     xp.reshape(13, 21, 23, 11)[:,::7])
+
+        # Test itemsize is dealt with
+        check_may_share_memory_exact(x[:,::7],
+                                     xp.reshape(13, 21, 23, 11))
+        check_may_share_memory_exact(x[:,::7],
+                                     xp.reshape(13, 21, 23, 11)[:,3::3])
+        check_may_share_memory_exact(x.ravel()[6:7],
+                                     xp.reshape(13, 21, 23, 11)[:,::7])
+
+    # Check unit size
+    x = np.zeros([1], dtype=np.int8)
+    check_may_share_memory_exact(x, x)
+    check_may_share_memory_exact(x, x.copy())
+
+
+def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
+    # Check that overlap problems with common strides are solved with
+    # little work.
+    x = np.zeros([17,34,71,97], dtype=np.int16)
+
+    rng = np.random.RandomState(1234)
+
+    def random_slice(n, step):
+        start = rng.randint(0, n+1)
+        stop = rng.randint(start, n+1)
+        if rng.randint(0, 2) == 0:
+            stop, start = start, stop
+            step *= -1
+        return slice(start, stop, step)
+
+    feasible = 0
+    infeasible = 0
+
+    while min(feasible, infeasible) < min_count:
+        steps = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+                      for j in range(x.ndim))
+        if same_steps:
+            steps2 = steps
+        else:
+            steps2 = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+                           for j in range(x.ndim))
+
+        t1 = np.arange(x.ndim)
+        rng.shuffle(t1)
+
+        t2 = np.arange(x.ndim)
+        rng.shuffle(t2)
+
+        s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+        s2 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps2))
+        a = x[s1].transpose(t1)
+        b = x[s2].transpose(t2)
+
+        bounds_overlap = np.may_share_memory(a, b)
+        may_share_answer = np.may_share_memory(a, b)
+        easy_answer = np.may_share_memory(a, b, max_work=get_max_work(a, b))
+        exact_answer = np.may_share_memory(a, b, max_work=MAY_SHARE_EXACT)
+
+        if easy_answer != exact_answer:
+            # assert_equal is slow...
+            assert_equal(easy_answer, exact_answer, err_msg=repr((s1, s2)))
+
+        if may_share_answer != bounds_overlap:
+            assert_equal(may_share_answer, bounds_overlap,
+                         err_msg=repr((s1, s2)))
+
+        if bounds_overlap:
+            if exact_answer:
+                feasible += 1
+            else:
+                infeasible += 1
+
+
+def test_may_share_memory_easy_fuzz():
+    # Check that overlap problems with common strides are always
+    # solved with little work.
+
+    check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: 1,
+                                     same_steps=True,
+                                     min_count=2000)
+
+
+def test_may_share_memory_harder_fuzz():
+    # Overlap problems with not necessarily common strides take more
+    # work.
+    #
+    # The work bound below can't be reduced much. Harder problems can
+    # also exist but not be detected here, as the set of problems
+    # comes from RNG.
+
+    check_may_share_memory_easy_fuzz(get_max_work=lambda a, b: max(a.size, b.size)//2,
+                                     same_steps=False,
+                                     min_count=2000)
+
+
+def test_shares_memory_api():
+    x = np.zeros([4, 5, 6], dtype=np.int8)
+
+    assert_equal(np.shares_memory(x, x), True)
+    assert_equal(np.shares_memory(x, x.copy()), False)
+
+    a = x[:,::2,::3]
+    b = x[:,::3,::2]
+    assert_equal(np.shares_memory(a, b), True)
+    assert_equal(np.shares_memory(a, b, max_work=None), True)
+    assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=1)
+    assert_raises(np.TooHardError, np.shares_memory, a, b, max_work=long(1))
+
+
+def test_internal_overlap_diophantine():
+    def check(A, U, exists=None):
+        X = solve_diophantine(A, U, 0, require_ub_nontrivial=1)
+
+        if exists is None:
+            exists = (X is not None)
+
+        if X is not None:
+            assert_(sum(a*x for a, x in zip(A, X)) == sum(a*u//2 for a, u in zip(A, U)))
+            assert_(all(0 <= x <= u for x, u in zip(X, U)))
+            assert_(any(x != u//2 for x, u in zip(X, U)))
+
+        if exists:
+            assert_(X is not None, repr(X))
+        else:
+            assert_(X is None, repr(X))
+
+    # Smoke tests
+    check((3, 2), (2*2, 3*2), exists=True)
+    check((3*2, 2), (15*2, (3-1)*2), exists=False)
+
+
+def test_internal_overlap_slices():
+    # Slicing an array never generates internal overlap
+
+    x = np.zeros([17,34,71,97], dtype=np.int16)
+
+    rng = np.random.RandomState(1234)
+
+    def random_slice(n, step):
+        start = rng.randint(0, n+1)
+        stop = rng.randint(start, n+1)
+        if rng.randint(0, 2) == 0:
+            stop, start = start, stop
+            step *= -1
+        return slice(start, stop, step)
+
+    cases = 0
+    min_count = 5000
+
+    while cases < min_count:
+        steps = tuple(rng.randint(1, 11) if rng.randint(0, 5) == 0 else 1
+                      for j in range(x.ndim))
+        t1 = np.arange(x.ndim)
+        rng.shuffle(t1)
+        s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+        a = x[s1].transpose(t1)
+
+        assert not internal_overlap(a)
+        cases += 1
+
+
+def check_internal_overlap(a, manual_expected=None):
+    got = internal_overlap(a)
+
+    # Brute-force check
+    m = set()
+    ranges = tuple(xrange(n) for n in a.shape)
+    for v in itertools.product(*ranges):
+        offset = sum(s*w for s, w in zip(a.strides, v))
+        if offset in m:
+            expected = True
+            break
+        else:
+            m.add(offset)
+    else:
+        expected = False
+
+    # Compare
+    if got != expected:
+        assert_equal(got, expected, err_msg=repr((a.strides, a.shape)))
+    if manual_expected is not None and expected != manual_expected:
+        assert_equal(expected, manual_expected)
+    return got
+
+
+def test_internal_overlap_manual():
+    # Stride tricks can construct arrays with internal overlap
+
+    # We don't care about memory bounds, the array is not
+    # read/write accessed
+    x = np.arange(1).astype(np.int8)
+
+    # Check low-dimensional special cases
+
+    check_internal_overlap(x, False) # 1-dim
+    check_internal_overlap(x.reshape([]), False) # 0-dim
+
+    a = as_strided(x, strides=(3, 4), shape=(4, 4))
+    check_internal_overlap(a, False)
+
+    a = as_strided(x, strides=(3, 4), shape=(5, 4))
+    check_internal_overlap(a, True)
+
+    a = as_strided(x, strides=(0,), shape=(0,))
+    check_internal_overlap(a, False)
+
+    a = as_strided(x, strides=(0,), shape=(1,))
+    check_internal_overlap(a, False)
+
+    a = as_strided(x, strides=(0,), shape=(2,))
+    check_internal_overlap(a, True)
+
+    a = as_strided(x, strides=(0, -9993), shape=(87, 22))
+    check_internal_overlap(a, True)
+
+    a = as_strided(x, strides=(0, -9993), shape=(1, 22))
+    check_internal_overlap(a, False)
+
+    a = as_strided(x, strides=(0, -9993), shape=(0, 22))
+    check_internal_overlap(a, False)
+
+
+def test_internal_overlap_fuzz():
+    # Fuzz check; the brute-force check is fairly slow
+
+    x = np.arange(1).astype(np.int8)
+
+    overlap = 0
+    no_overlap = 0
+    min_count = 100
+
+    rng = np.random.RandomState(1234)
+
+    while min(overlap, no_overlap) < min_count:
+        ndim = rng.randint(1, 4)
+
+        strides = tuple(rng.randint(-1000, 1000) for j in range(ndim))
+        shape = tuple(rng.randint(1, 30) for j in range(ndim))
+
+        a = as_strided(x, strides=strides, shape=shape)
+        result = check_internal_overlap(a)
+
+        if result:
+            overlap += 1
+        else:
+            no_overlap += 1
+
+
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_multiarray_assignment.py b/numpy/core/tests/test_multiarray_assignment.py
deleted file mode 100644
index 86e1b125e..000000000
--- a/numpy/core/tests/test_multiarray_assignment.py
+++ /dev/null
@@ -1,84 +0,0 @@
-from __future__ import division, absolute_import, print_function
-
-import numpy as np
-from numpy.testing import run_module_suite
-
-ndims = 2
-size = 10
-shape = tuple([size] * ndims)
-
-
-def _indices_for_nelems(nelems):
-    """Returns slices of length nelems, from start onwards, in direction sign."""
-
-    if nelems == 0:
-        return [size // 2]  # int index
-
-    res = []
-    for step in (1, 2):
-        for sign in (-1, 1):
-            start = size // 2 - nelems * step * sign // 2
-            stop = start + nelems * step * sign
-            res.append(slice(start, stop, step * sign))
-
-    return res
-
-
-def _indices_for_axis():
-    """Returns (src, dst) pairs of indices."""
-
-    res = []
-    for nelems in (0, 2, 3):
-        ind = _indices_for_nelems(nelems)
-
-        # no itertools.product available in Py2.4
-        res.extend([(a, b) for a in ind for b in ind])  # all assignments of size "nelems"
-
-    return res
-
-
-def _indices(ndims):
-    """Returns ((axis0_src, axis0_dst), (axis1_src, axis1_dst), ... ) index pairs."""
-
-    ind = _indices_for_axis()
-
-    # no itertools.product available in Py2.4
-
-    res = [[]]
-    for i in range(ndims):
-        newres = []
-        for elem in ind:
-            for others in res:
-                newres.append([elem] + others)
-        res = newres
-
-    return res
-
-
-def _check_assignment(srcidx, dstidx):
-    """Check assignment arr[dstidx] = arr[srcidx] works."""
-
-    arr = np.arange(np.product(shape)).reshape(shape)
-
-    cpy = arr.copy()
-
-    cpy[dstidx] = arr[srcidx]
-    arr[dstidx] = arr[srcidx]
-
-    assert np.all(arr == cpy), 'assigning arr[%s] = arr[%s]' % (dstidx, srcidx)
-
-
-def test_overlapping_assignments():
-    """Test automatically generated assignments which overlap in memory."""
-
-    inds = _indices(ndims)
-
-    for ind in inds:
-        srcidx = tuple([a[0] for a in ind])
-        dstidx = tuple([a[1] for a in ind])
-
-        yield _check_assignment, srcidx, dstidx
-
-
-if __name__ == "__main__":
-    run_module_suite()