60 files changed, 2040 insertions, 1407 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 77b1d924d..b912d2222 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -139,9 +139,7 @@ else:
         loader = PackageLoader(infunc=True)
         return loader(*packages, **options)
 
-    from . import add_newdocs
-    __all__ = ['add_newdocs',
-               'ModuleDeprecationWarning',
+    __all__ = ['ModuleDeprecationWarning',
                'VisibleDeprecationWarning']
 
     pkgload.__doc__ = PackageLoader.__call__.__doc__
@@ -191,7 +189,7 @@ else:
     from .testing import Tester
 
     # Pytest testing
-    from numpy.testing._private.pytesttester import PytestTester
+    from numpy._pytesttester import PytestTester
     test = PytestTester(__name__)
     del PytestTester
 
@@ -214,7 +212,9 @@ else:
         except AssertionError:
             msg = ("The current Numpy installation ({!r}) fails to "
                    "pass simple sanity checks. This can be caused for example "
-                   "by incorrect BLAS library being linked in.")
+                   "by incorrect BLAS library being linked in, or by mixing "
+                   "package managers (pip, conda, apt, ...). Search closed "
+                   "numpy issues for similar problems.")
             raise RuntimeError(msg.format(__file__))
 
     _sanity_check()
diff --git a/numpy/testing/_private/pytesttester.py b/numpy/_pytesttester.py
index 8c73fafa4..6a1b3274e 100644
--- a/numpy/testing/_private/pytesttester.py
+++ b/numpy/_pytesttester.py
@@ -5,7 +5,7 @@ This module implements the ``test()`` function for NumPy modules. The usual
 boiler plate for doing that is to put the following in the module
 ``__init__.py`` file::
 
-    from numpy.testing import PytestTester
+    from numpy._pytesttester import PytestTester
     test = PytestTester(__name__).test
     del PytestTester
 
@@ -23,6 +23,9 @@ whether or not that file is found as follows:
 In practice, tests run from the numpy repo are run in develop mode. That
 includes the standard ``python runtests.py`` invocation.
 
+This module is imported by every numpy subpackage, so lies at the top level to
+simplify circular import issues. For the same reason, it contains no numpy
+imports at module scope, instead importing numpy within function calls.
 """
 from __future__ import division, absolute_import, print_function
 
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 4d9cbf5da..9ef30b018 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -59,6 +59,10 @@ del nt
 from .fromnumeric import amax as max, amin as min, round_ as round
 from .numeric import absolute as abs
 
+# do this after everything else, to minimize the chance of this misleadingly
+# appearing in an import-time traceback
+from . import _add_newdocs
+
 __all__ = ['char', 'rec', 'memmap']
 __all__ += numeric.__all__
 __all__ += fromnumeric.__all__
@@ -100,6 +104,6 @@ del copyreg
 del sys
 del _ufunc_reduce
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/add_newdocs.py b/numpy/core/_add_newdocs.py
index d87c964c2..b65920fde 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -10,7 +10,7 @@ NOTE: Many of the methods of ndarray have corresponding functions.
 """
 from __future__ import division, absolute_import, print_function
 
-from numpy.lib import add_newdoc
+from numpy.core.function_base import add_newdoc
 
 ###############################################################################
 #
@@ -605,6 +605,7 @@ add_newdoc('numpy.core', 'broadcast',
 
     Examples
     --------
+
     Manually adding two vectors, using broadcasting:
 
     >>> x = np.array([[1], [2], [3]])
@@ -1318,6 +1319,7 @@ add_newdoc('numpy.core.multiarray', 'concatenate',
     hstack : Stack arrays in sequence horizontally (column wise)
     vstack : Stack arrays in sequence vertically (row wise)
     dstack : Stack arrays in sequence depth wise (along third dimension)
+    block : Assemble arrays from blocks.
 
     Notes
     -----
@@ -1347,19 +1349,19 @@ add_newdoc('numpy.core.multiarray', 'concatenate',
     >>> a[1] = np.ma.masked
     >>> b = np.arange(2, 5)
     >>> a
-    masked_array(data = [0 -- 2],
-                 mask = [False  True False],
-           fill_value = 999999)
+    masked_array(data=[0, --, 2],
+                 mask=[False,  True, False],
+           fill_value=999999)
     >>> b
     array([2, 3, 4])
     >>> np.concatenate([a, b])
-    masked_array(data = [0 1 2 2 3 4],
-                 mask = False,
-           fill_value = 999999)
+    masked_array(data=[0, 1, 2, 2, 3, 4],
+                 mask=False,
+           fill_value=999999)
     >>> np.ma.concatenate([a, b])
-    masked_array(data = [0 -- 2 2 3 4],
-                 mask = [False  True False False False False],
-           fill_value = 999999)
+    masked_array(data=[0, --, 2, 2, 3, 4],
+                 mask=[False,  True, False, False, False, False],
+           fill_value=999999)
 
     """)
 
@@ -1576,71 +1578,72 @@ add_newdoc('numpy.core.multiarray', 'where',
     """
     where(condition, [x, y])
 
-    Return elements, either from `x` or `y`, depending on `condition`.
+    Return elements chosen from `x` or `y` depending on `condition`.
 
-    If only `condition` is given, return ``condition.nonzero()``.
+    .. note::
+        When only `condition` is provided, this function is a shorthand for
+        ``np.asarray(condition).nonzero()``. Using `nonzero` directly should be
+        preferred, as it behaves correctly for subclasses. The rest of this
+        documentation covers only the case where all three arguments are
+        provided.
 
     Parameters
     ----------
     condition : array_like, bool
-        When True, yield `x`, otherwise yield `y`.
-    x, y : array_like, optional
+        Where True, yield `x`, otherwise yield `y`.
+    x, y : array_like
         Values from which to choose. `x`, `y` and `condition` need to be
         broadcastable to some shape.
 
     Returns
     -------
-    out : ndarray or tuple of ndarrays
-        If both `x` and `y` are specified, the output array contains
-        elements of `x` where `condition` is True, and elements from
-        `y` elsewhere.
-
-        If only `condition` is given, return the tuple
-        ``condition.nonzero()``, the indices where `condition` is True.
+    out : ndarray
+        An array with elements from `x` where `condition` is True, and elements
+        from `y` elsewhere.
 
     See Also
     --------
-    nonzero, choose
+    choose
+    nonzero : The function that is called when x and y are omitted
 
     Notes
     -----
-    If `x` and `y` are given and input arrays are 1-D, `where` is
-    equivalent to::
+    If all the arrays are 1-D, `where` is equivalent to::
 
-        [xv if c else yv for (c,xv,yv) in zip(condition,x,y)]
+        [xv if c else yv
+         for c, xv, yv in zip(condition, x, y)]
 
     Examples
     --------
+    >>> a = np.arange(10)
+    >>> a
+    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+    >>> np.where(a < 5, a, 10*a)
+    array([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])
+
+    This can be used on multidimensional arrays too:
+
     >>> np.where([[True, False], [True, True]],
     ...          [[1, 2], [3, 4]],
     ...          [[9, 8], [7, 6]])
     array([[1, 8],
            [3, 4]])
 
-    >>> np.where([[0, 1], [1, 0]])
-    (array([0, 1]), array([1, 0]))
-
-    >>> x = np.arange(9.).reshape(3, 3)
-    >>> np.where( x > 5 )
-    (array([2, 2, 2]), array([0, 1, 2]))
-    >>> x[np.where( x > 3.0 )]               # Note: result is 1D.
-    array([ 4.,  5.,  6.,  7.,  8.])
-    >>> np.where(x < 5, x, -1)               # Note: broadcasting.
-    array([[ 0.,  1.,  2.],
-           [ 3.,  4., -1.],
-           [-1., -1., -1.]])
-
-    Find the indices of elements of `x` that are in `goodvalues`.
-
-    >>> goodvalues = [3, 4, 7]
-    >>> ix = np.isin(x, goodvalues)
-    >>> ix
-    array([[False, False, False],
-           [ True,  True, False],
-           [False,  True, False]])
-    >>> np.where(ix)
-    (array([1, 1, 2]), array([0, 1, 1]))
+    The shapes of x, y, and the condition are broadcast together:
+
+    >>> x, y = np.ogrid[:3, :4]
+    >>> np.where(x < y, x, 10 + y)  # both x and 10+y are broadcast
+    array([[10,  0,  0,  0],
+           [10, 11,  1,  1],
+           [10, 11, 12,  2]])
 
+    >>> a = np.array([[0, 1, 2],
+    ...               [0, 2, 4],
+    ...               [0, 3, 6]])
+    >>> np.where(a < 4, a, -1)  # -1 is broadcast
+    array([[ 0,  1,  2],
+           [ 0,  2, -1],
+           [ 0,  3, -1]])
     """)
 
 
@@ -2265,25 +2268,89 @@ add_newdoc('numpy.core', 'matmul',
 
     """)
 
+add_newdoc('numpy.core', 'vdot',
+    """
+    vdot(a, b)
+
+    Return the dot product of two vectors.
+
+    The vdot(`a`, `b`) function handles complex numbers differently than
+    dot(`a`, `b`).  If the first argument is complex the complex conjugate
+    of the first argument is used for the calculation of the dot product.
+
+    Note that `vdot` handles multidimensional arrays differently than `dot`:
+    it does *not* perform a matrix product, but flattens input arguments
+    to 1-D vectors first. Consequently, it should only be used for vectors.
+
+    Parameters
+    ----------
+    a : array_like
+        If `a` is complex the complex conjugate is taken before calculation
+        of the dot product.
+    b : array_like
+        Second argument to the dot product.
+
+    Returns
+    -------
+    output : ndarray
+        Dot product of `a` and `b`.  Can be an int, float, or
+        complex depending on the types of `a` and `b`.
+
+    See Also
+    --------
+    dot : Return the dot product without using the complex conjugate of the
+          first argument.
+
+    Examples
+    --------
+    >>> a = np.array([1+2j,3+4j])
+    >>> b = np.array([5+6j,7+8j])
+    >>> np.vdot(a, b)
+    (70-8j)
+    >>> np.vdot(b, a)
+    (70+8j)
+
+    Note that higher-dimensional arrays are flattened!
+
+    >>> a = np.array([[1, 4], [5, 6]])
+    >>> b = np.array([[4, 1], [2, 2]])
+    >>> np.vdot(a, b)
+    30
+    >>> np.vdot(b, a)
+    30
+    >>> 1*4 + 4*1 + 5*2 + 6*2
+    30
+
+    """)
 
-add_newdoc('numpy.core', 'c_einsum',
+add_newdoc('numpy.core.multiarray', 'c_einsum',
     """
-    c_einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe')
+    c_einsum(subscripts, *operands, out=None, dtype=None, order='K',
+           casting='safe')
+           
+    *This documentation shadows that of the native python implementation of the `einsum` function,
+    except all references and examples related to the `optimize` argument (v 0.12.0) have been removed.*
 
     Evaluates the Einstein summation convention on the operands.
 
-    Using the Einstein summation convention, many common multi-dimensional
-    array operations can be represented in a simple fashion.  This function
-    provides a way to compute such summations. The best way to understand this
-    function is to try the examples below, which show how many common NumPy
-    functions can be implemented as calls to `einsum`.
+    Using the Einstein summation convention, many common multi-dimensional,
+    linear algebraic array operations can be represented in a simple fashion.
+    In *implicit* mode `einsum` computes these values.
 
-    This is the core C function.
+    In *explicit* mode, `einsum` provides further flexibility to compute
+    other array operations that might not be considered classical Einstein
+    summation operations, by disabling, or forcing summation over specified
+    subscript labels.
+
+    See the notes and examples for clarification.
 
     Parameters
     ----------
     subscripts : str
-        Specifies the subscripts for summation.
+        Specifies the subscripts for summation as comma separated list of
+        subscript labels. An implicit (classical Einstein summation)
+        calculation is performed unless the explicit indicator '->' is
+        included as well as subscript labels of the precise output form.
     operands : list of array_like
         These are the arrays for the operation.
     out : ndarray, optional
@@ -2311,6 +2378,11 @@ add_newdoc('numpy.core', 'c_einsum',
           * 'unsafe' means any data conversions may be done.
 
         Default is 'safe'.
+    optimize : {False, True, 'greedy', 'optimal'}, optional
+        Controls if intermediate optimization should occur. No optimization
+        will occur if False and True will default to the 'greedy' algorithm.
+        Also accepts an explicit contraction list from the ``np.einsum_path``
+        function. See ``np.einsum_path`` for more details. Defaults to False.
 
     Returns
     -------
@@ -2319,56 +2391,86 @@ add_newdoc('numpy.core', 'c_einsum',
 
     See Also
     --------
-    einsum, dot, inner, outer, tensordot
+    einsum_path, dot, inner, outer, tensordot, linalg.multi_dot
 
     Notes
     -----
     .. versionadded:: 1.6.0
 
-    The subscripts string is a comma-separated list of subscript labels,
-    where each label refers to a dimension of the corresponding operand.
-    Repeated subscripts labels in one operand take the diagonal.  For example,
-    ``np.einsum('ii', a)`` is equivalent to ``np.trace(a)``.
+    The Einstein summation convention can be used to compute
+    many multi-dimensional, linear algebraic array operations. `einsum`
+    provides a succinct way of representing these.
 
-    Whenever a label is repeated, it is summed, so ``np.einsum('i,i', a, b)``
-    is equivalent to ``np.inner(a,b)``.  If a label appears only once,
-    it is not summed, so ``np.einsum('i', a)`` produces a view of ``a``
-    with no changes.
+    A non-exhaustive list of these operations,
+    which can be computed by `einsum`, is shown below along with examples:
 
-    The order of labels in the output is by default alphabetical.  This
-    means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while
-    ``np.einsum('ji', a)`` takes its transpose.
+    * Trace of an array, :py:func:`numpy.trace`.
+    * Return a diagonal, :py:func:`numpy.diag`.
+    * Array axis summations, :py:func:`numpy.sum`.
+    * Transpositions and permutations, :py:func:`numpy.transpose`.
+    * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`.
+    * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`.
+    * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`.
+    * Tensor contractions, :py:func:`numpy.tensordot`.
+    * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`.
 
-    The output can be controlled by specifying output subscript labels
-    as well.  This specifies the label order, and allows summing to
-    be disallowed or forced when desired.  The call ``np.einsum('i->', a)``
-    is like ``np.sum(a, axis=-1)``, and ``np.einsum('ii->i', a)``
-    is like ``np.diag(a)``.  The difference is that `einsum` does not
-    allow broadcasting by default.
+    The subscripts string is a comma-separated list of subscript labels,
+    where each label refers to a dimension of the corresponding operand.
+    Whenever a label is repeated it is summed, so ``np.einsum('i,i', a, b)``
+    is equivalent to :py:func:`np.inner(a,b) <numpy.inner>`. If a label
+    appears only once, it is not summed, so ``np.einsum('i', a)`` produces a
+    view of ``a`` with no changes. A further example ``np.einsum('ij,jk', a, b)``
+    describes traditional matrix multiplication and is equivalent to
+    :py:func:`np.matmul(a,b) <numpy.matmul>`. Repeated subscript labels in one
+    operand take the diagonal. For example, ``np.einsum('ii', a)`` is equivalent
+    to :py:func:`np.trace(a) <numpy.trace>`.
+
+    In *implicit mode*, the chosen subscripts are important
+    since the axes of the output are reordered alphabetically.  This
+    means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while
+    ``np.einsum('ji', a)`` takes its transpose. Additionally,
+    ``np.einsum('ij,jk', a, b)`` returns a matrix multiplication, while,
+    ``np.einsum('ij,jh', a, b)`` returns the transpose of the
+    multiplication since subscript 'h' precedes subscript 'i'.
+
+    In *explicit mode* the output can be directly controlled by
+    specifying output subscript labels.  This requires the
+    identifier '->' as well as the list of output subscript labels.
+    This feature increases the flexibility of the function since
+    summing can be disabled or forced when required. The call
+    ``np.einsum('i->', a)`` is like :py:func:`np.sum(a, axis=-1) <numpy.sum>`,
+    and ``np.einsum('ii->i', a)`` is like :py:func:`np.diag(a) <numpy.diag>`.
+    The difference is that `einsum` does not allow broadcasting by default.
+    Additionally ``np.einsum('ij,jh->ih', a, b)`` directly specifies the
+    order of the output subscript labels and therefore returns matrix
+    multiplication, unlike the example above in implicit mode.
 
     To enable and control broadcasting, use an ellipsis.  Default
     NumPy-style broadcasting is done by adding an ellipsis
     to the left of each term, like ``np.einsum('...ii->...i', a)``.
     To take the trace along the first and last axes,
     you can do ``np.einsum('i...i', a)``, or to do a matrix-matrix
-    product with the left-most indices instead of rightmost, you can do
+    product with the left-most indices instead of rightmost, one can do
     ``np.einsum('ij...,jk...->ik...', a, b)``.
 
     When there is only one operand, no axes are summed, and no output
     parameter is provided, a view into the operand is returned instead
     of a new array.  Thus, taking the diagonal as ``np.einsum('ii->i', a)``
-    produces a view.
+    produces a view (changed in version 1.10.0).
 
-    An alternative way to provide the subscripts and operands is as
-    ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. The examples
-    below have corresponding `einsum` calls with the two parameter methods.
+    `einsum` also provides an alternative way to provide the subscripts
+    and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``.
+    If the output shape is not provided in this format `einsum` will be
+    calculated in implicit mode, otherwise it will be performed explicitly.
+    The examples below have corresponding `einsum` calls with the two
+    parameter methods.
 
     .. versionadded:: 1.10.0
 
     Views returned from einsum are now writeable whenever the input array
     is writeable. For example, ``np.einsum('ijk...->kji...', a)`` will now
-    have the same effect as ``np.swapaxes(a, 0, 2)`` and
-    ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal
+    have the same effect as :py:func:`np.swapaxes(a, 0, 2) <numpy.swapaxes>`
+    and ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal
     of a 2D array.
 
     Examples
@@ -2377,6 +2479,8 @@ add_newdoc('numpy.core', 'c_einsum',
     >>> b = np.arange(5)
     >>> c = np.arange(6).reshape(2,3)
 
+    Trace of a matrix:
+
     >>> np.einsum('ii', a)
     60
     >>> np.einsum(a, [0,0])
@@ -2384,6 +2488,8 @@ add_newdoc('numpy.core', 'c_einsum',
     >>> np.trace(a)
     60
 
+    Extract the diagonal (requires explicit form):
+
     >>> np.einsum('ii->i', a)
     array([ 0,  6, 12, 18, 24])
     >>> np.einsum(a, [0,0], [0])
@@ -2391,31 +2497,69 @@ add_newdoc('numpy.core', 'c_einsum',
     >>> np.diag(a)
     array([ 0,  6, 12, 18, 24])
 
-    >>> np.einsum('ij,j', a, b)
-    array([ 30,  80, 130, 180, 230])
-    >>> np.einsum(a, [0,1], b, [1])
-    array([ 30,  80, 130, 180, 230])
-    >>> np.dot(a, b)
-    array([ 30,  80, 130, 180, 230])
-    >>> np.einsum('...j,j', a, b)
-    array([ 30,  80, 130, 180, 230])
+    Sum over an axis (requires explicit form):
+
+    >>> np.einsum('ij->i', a)
+    array([ 10,  35,  60,  85, 110])
+    >>> np.einsum(a, [0,1], [0])
+    array([ 10,  35,  60,  85, 110])
+    >>> np.sum(a, axis=1)
+    array([ 10,  35,  60,  85, 110])
+
+    For higher dimensional arrays summing a single axis can be done with ellipsis:
+
+    >>> np.einsum('...j->...', a)
+    array([ 10,  35,  60,  85, 110])
+    >>> np.einsum(a, [Ellipsis,1], [Ellipsis])
+    array([ 10,  35,  60,  85, 110])
+
+    Compute a matrix transpose, or reorder any number of axes:
 
     >>> np.einsum('ji', c)
     array([[0, 3],
            [1, 4],
            [2, 5]])
+    >>> np.einsum('ij->ji', c)
+    array([[0, 3],
+           [1, 4],
+           [2, 5]])
     >>> np.einsum(c, [1,0])
     array([[0, 3],
            [1, 4],
            [2, 5]])
-    >>> c.T
+    >>> np.transpose(c)
     array([[0, 3],
            [1, 4],
            [2, 5]])
 
+    Vector inner products:
+
+    >>> np.einsum('i,i', b, b)
+    30
+    >>> np.einsum(b, [0], b, [0])
+    30
+    >>> np.inner(b,b)
+    30
+
+    Matrix vector multiplication:
+
+    >>> np.einsum('ij,j', a, b)
+    array([ 30,  80, 130, 180, 230])
+    >>> np.einsum(a, [0,1], b, [1])
+    array([ 30,  80, 130, 180, 230])
+    >>> np.dot(a, b)
+    array([ 30,  80, 130, 180, 230])
+    >>> np.einsum('...j,j', a, b)
+    array([ 30,  80, 130, 180, 230])
+
+    Broadcasting and scalar multiplication:
+
     >>> np.einsum('..., ...', 3, c)
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
+    >>> np.einsum(',ij', 3, c)
+    array([[ 0,  3,  6],
+           [ 9, 12, 15]])
     >>> np.einsum(3, [Ellipsis], c, [Ellipsis])
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
@@ -2423,12 +2567,7 @@ add_newdoc('numpy.core', 'c_einsum',
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
 
-    >>> np.einsum('i,i', b, b)
-    30
-    >>> np.einsum(b, [0], b, [0])
-    30
-    >>> np.inner(b,b)
-    30
+    Vector outer product:
 
     >>> np.einsum('i,j', np.arange(2)+1, b)
     array([[0, 1, 2, 3, 4],
@@ -2440,12 +2579,7 @@ add_newdoc('numpy.core', 'c_einsum',
     array([[0, 1, 2, 3, 4],
            [0, 2, 4, 6, 8]])
 
-    >>> np.einsum('i...->...', a)
-    array([50, 55, 60, 65, 70])
-    >>> np.einsum(a, [0,Ellipsis], [Ellipsis])
-    array([50, 55, 60, 65, 70])
-    >>> np.sum(a, axis=0)
-    array([50, 55, 60, 65, 70])
+    Tensor contraction:
 
     >>> a = np.arange(60.).reshape(3,4,5)
     >>> b = np.arange(24.).reshape(4,3,2)
@@ -2468,6 +2602,17 @@ add_newdoc('numpy.core', 'c_einsum',
            [ 4796.,  5162.],
            [ 4928.,  5306.]])
 
+    Writeable returned arrays (since version 1.10.0):
+
+    >>> a = np.zeros((3, 3))
+    >>> np.einsum('ii->i', a)[:] = 1
+    >>> a
+    array([[ 1.,  0.,  0.],
+           [ 0.,  1.,  0.],
+           [ 0.,  0.,  1.]])
+
+    Example of ellipsis use:
+
     >>> a = np.arange(6).reshape((3,2))
     >>> b = np.arange(12).reshape((4,3))
     >>> np.einsum('ki,jk->ij', a, b)
@@ -2480,69 +2625,6 @@ add_newdoc('numpy.core', 'c_einsum',
     array([[10, 28, 46, 64],
            [13, 40, 67, 94]])
 
-    >>> # since version 1.10.0
-    >>> a = np.zeros((3, 3))
-    >>> np.einsum('ii->i', a)[:] = 1
-    >>> a
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
-
-    """)
-
-add_newdoc('numpy.core', 'vdot',
-    """
-    vdot(a, b)
-
-    Return the dot product of two vectors.
-
-    The vdot(`a`, `b`) function handles complex numbers differently than
-    dot(`a`, `b`).  If the first argument is complex the complex conjugate
-    of the first argument is used for the calculation of the dot product.
-
-    Note that `vdot` handles multidimensional arrays differently than `dot`:
-    it does *not* perform a matrix product, but flattens input arguments
-    to 1-D vectors first. Consequently, it should only be used for vectors.
-
-    Parameters
-    ----------
-    a : array_like
-        If `a` is complex the complex conjugate is taken before calculation
-        of the dot product.
-    b : array_like
-        Second argument to the dot product.
-
-    Returns
-    -------
-    output : ndarray
-        Dot product of `a` and `b`.  Can be an int, float, or
-        complex depending on the types of `a` and `b`.
-
-    See Also
-    --------
-    dot : Return the dot product without using the complex conjugate of the
-          first argument.
-
-    Examples
-    --------
-    >>> a = np.array([1+2j,3+4j])
-    >>> b = np.array([5+6j,7+8j])
-    >>> np.vdot(a, b)
-    (70-8j)
-    >>> np.vdot(b, a)
-    (70+8j)
-
-    Note that higher-dimensional arrays are flattened!
-
-    >>> a = np.array([[1, 4], [5, 6]])
-    >>> b = np.array([[4, 1], [2, 2]])
-    >>> np.vdot(a, b)
-    30
-    >>> np.vdot(b, a)
-    30
-    >>> 1*4 + 4*1 + 5*2 + 6*2
-    30
-
     """)
 
 
@@ -5215,99 +5297,6 @@ add_newdoc('numpy.core.umath', 'seterrobj',
 #
 ##############################################################################
 
-add_newdoc('numpy.core.multiarray', 'digitize',
-    """
-    digitize(x, bins, right=False)
-
-    Return the indices of the bins to which each value in input array belongs.
-
-    =========  =============  ============================
-    `right`    order of bins  returned index `i` satisfies
-    =========  =============  ============================
-    ``False``  increasing     ``bins[i-1] <= x < bins[i]``
-    ``True``   increasing     ``bins[i-1] < x <= bins[i]``
-    ``False``  decreasing     ``bins[i-1] > x >= bins[i]``
-    ``True``   decreasing     ``bins[i-1] >= x > bins[i]``
-    =========  =============  ============================
-
-    If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
-    returned as appropriate.
-
-    Parameters
-    ----------
-    x : array_like
-        Input array to be binned. Prior to NumPy 1.10.0, this array had to
-        be 1-dimensional, but can now have any shape.
-    bins : array_like
-        Array of bins. It has to be 1-dimensional and monotonic.
-    right : bool, optional
-        Indicating whether the intervals include the right or the left bin
-        edge. Default behavior is (right==False) indicating that the interval
-        does not include the right edge. The left bin end is open in this
-        case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
-        monotonically increasing bins.
-
-    Returns
-    -------
-    indices : ndarray of ints
-        Output array of indices, of same shape as `x`.
-
-    Raises
-    ------
-    ValueError
-        If `bins` is not monotonic.
-    TypeError
-        If the type of the input is complex.
-
-    See Also
-    --------
-    bincount, histogram, unique, searchsorted
-
-    Notes
-    -----
-    If values in `x` are such that they fall outside the bin range,
-    attempting to index `bins` with the indices that `digitize` returns
-    will result in an IndexError.
-
-    .. versionadded:: 1.10.0
-
-    `np.digitize` is  implemented in terms of `np.searchsorted`. This means
-    that a binary search is used to bin the values, which scales much better
-    for larger number of bins than the previous linear search. It also removes
-    the requirement for the input array to be 1-dimensional.
-
-    For monotonically _increasing_ `bins`, the following are equivalent::
-
-        np.digitize(x, bins, right=True)
-        np.searchsorted(bins, x, side='left')
-
-    Note that as the order of the arguments are reversed, the side must be too.
-    The `searchsorted` call is marginally faster, as it does not do any
-    monotonicity checks. Perhaps more importantly, it supports all dtypes.
-
-    Examples
-    --------
-    >>> x = np.array([0.2, 6.4, 3.0, 1.6])
-    >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
-    >>> inds = np.digitize(x, bins)
-    >>> inds
-    array([1, 4, 3, 2])
-    >>> for n in range(x.size):
-    ...   print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
-    ...
-    0.0 <= 0.2 < 1.0
-    4.0 <= 6.4 < 10.0
-    2.5 <= 3.0 < 4.0
-    1.0 <= 1.6 < 2.5
-
-    >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
-    >>> bins = np.array([0, 5, 10, 15, 20])
-    >>> np.digitize(x,bins,right=True)
-    array([1, 2, 3, 4, 4])
-    >>> np.digitize(x,bins,right=False)
-    array([1, 3, 3, 4, 5])
-    """)
-
 add_newdoc('numpy.core.multiarray', 'bincount',
     """
     bincount(x, weights=None, minlength=0)
@@ -7144,8 +7133,8 @@ add_newdoc('numpy.core.multiarray', 'datetime_data',
 
     Get information about the step size of a date or time type.
 
-    The returned tuple can be passed as the second argument of `datetime64` and
-    `timedelta64`.
+    The returned tuple can be passed as the second argument of `numpy.datetime64` and
+    `numpy.timedelta64`.
 
     Parameters
     ----------
@@ -7175,94 +7164,6 @@ add_newdoc('numpy.core.multiarray', 'datetime_data',
     numpy.datetime64('2010-01-01T00:00:00','25s')
     """)
 
-##############################################################################
-#
-# nd_grid instances
-#
-##############################################################################
-
-add_newdoc('numpy.lib.index_tricks', 'mgrid',
-    """
-    `nd_grid` instance which returns a dense multi-dimensional "meshgrid".
-
-    An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense
-    (or fleshed out) mesh-grid when indexed, so that each returned argument
-    has the same shape.  The dimensions and number of the output arrays are
-    equal to the number of indexing dimensions.  If the step length is not a
-    complex number, then the stop is not inclusive.
-
-    However, if the step length is a **complex number** (e.g. 5j), then
-    the integer part of its magnitude is interpreted as specifying the
-    number of points to create between the start and stop values, where
-    the stop value **is inclusive**.
-
-    Returns
-    ----------
-    mesh-grid `ndarrays` all of the same dimensions
-
-    See Also
-    --------
-    numpy.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
-    ogrid : like mgrid but returns open (not fleshed out) mesh grids
-    r_ : array concatenator
-
-    Examples
-    --------
-    >>> np.mgrid[0:5,0:5]
-    array([[[0, 0, 0, 0, 0],
-            [1, 1, 1, 1, 1],
-            [2, 2, 2, 2, 2],
-            [3, 3, 3, 3, 3],
-            [4, 4, 4, 4, 4]],
-           [[0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4]]])
-    >>> np.mgrid[-1:1:5j]
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-
-    """)
-
-add_newdoc('numpy.lib.index_tricks', 'ogrid',
-    """
-    `nd_grid` instance which returns an open multi-dimensional "meshgrid".
-
-    An instance of `numpy.lib.index_tricks.nd_grid` which returns an open
-    (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension
-    of each returned array is greater than 1.  The dimension and number of the
-    output arrays are equal to the number of indexing dimensions.  If the step
-    length is not a complex number, then the stop is not inclusive.
-
-    However, if the step length is a **complex number** (e.g. 5j), then
-    the integer part of its magnitude is interpreted as specifying the
-    number of points to create between the start and stop values, where
-    the stop value **is inclusive**.
-
-    Returns
-    ----------
-    mesh-grid `ndarrays` with only one dimension :math:`\\neq 1`
-
-    See Also
-    --------
-    np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
-    mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids
-    r_ : array concatenator
-
-    Examples
-    --------
-    >>> from numpy import ogrid
-    >>> ogrid[-1:1:5j]
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-    >>> ogrid[0:5,0:5]
-    [array([[0],
-            [1],
-            [2],
-            [3],
-            [4]]), array([[0, 1, 2, 3, 4]])]
-
-    """)
-
 
 ##############################################################################
 #
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index a4c18d482..163f125c2 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -4,6 +4,8 @@ Implementation of optimized einsum.
 """
 from __future__ import division, absolute_import, print_function
 
+import itertools
+
 from numpy.compat import basestring
 from numpy.core.multiarray import c_einsum
 from numpy.core.numeric import asarray, asanyarray, result_type, tensordot, dot
@@ -14,6 +16,44 @@ einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 einsum_symbols_set = set(einsum_symbols)
 
 
+def _flop_count(idx_contraction, inner, num_terms, size_dictionary):
+    """
+    Computes the number of FLOPS in the contraction.
+
+    Parameters
+    ----------
+    idx_contraction : iterable
+        The indices involved in the contraction
+    inner : bool
+        Does this contraction require an inner product?
+    num_terms : int
+        The number of terms in a contraction
+    size_dictionary : dict
+        The size of each of the indices in idx_contraction
+
+    Returns
+    -------
+    flop_count : int
+        The total number of FLOPS required for the contraction.
+
+    Examples
+    --------
+
+    >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5})
+    90
+
+    >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5})
+    270
+
+    """
+
+    overall_size = _compute_size_by_dict(idx_contraction, size_dictionary)
+    op_factor = max(1, num_terms - 1)
+    if inner:
+        op_factor += 1
+
+    return overall_size * op_factor
+
 def _compute_size_by_dict(indices, idx_dict):
     """
     Computes the product of the elements in indices based on the dictionary
@@ -139,14 +179,9 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit):
         iter_results = []
 
         # Compute all unique pairs
-        comb_iter = []
-        for x in range(len(input_sets) - iteration):
-            for y in range(x + 1, len(input_sets) - iteration):
-                comb_iter.append((x, y))
-
         for curr in full_results:
             cost, positions, remaining = curr
-            for con in comb_iter:
+            for con in itertools.combinations(range(len(input_sets) - iteration), 2):
 
                 # Find the contraction
                 cont = _find_contraction(con, remaining, output_set)
@@ -157,15 +192,10 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit):
                 if new_size > memory_limit:
                     continue
 
-                # Find cost
-                new_cost = _compute_size_by_dict(idx_contract, idx_dict)
-                if idx_removed:
-                    new_cost *= 2
-
                 # Build (total_cost, positions, indices_remaining)
-                new_cost += cost
+                total_cost =  cost + _flop_count(idx_contract, idx_removed, len(con), idx_dict)
                 new_pos = positions + [con]
-                iter_results.append((new_cost, new_pos, new_input_sets))
+                iter_results.append((total_cost, new_pos, new_input_sets))
 
         # Update combinatorial list, if we did not find anything return best
         # path + remaining contractions
@@ -183,6 +213,102 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit):
     path = min(full_results, key=lambda x: x[0])[1]
     return path
 
+def _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost, naive_cost):
+    """Compute the cost (removed size + flops) and resultant indices for
+    performing the contraction specified by ``positions``.
+
+    Parameters
+    ----------
+    positions : tuple of int
+        The locations of the proposed tensors to contract.
+    input_sets : list of sets
+        The indices found on each tensors.
+    output_set : set
+        The output indices of the expression.
+    idx_dict : dict
+        Mapping of each index to its size.
+    memory_limit : int
+        The total allowed size for an intermediary tensor.
+    path_cost : int
+        The contraction cost so far.
+    naive_cost : int
+        The cost of the unoptimized expression.
+
+    Returns
+    -------
+    cost : (int, int)
+        A tuple containing the size of any indices removed, and the flop cost.
+    positions : tuple of int
+        The locations of the proposed tensors to contract.
+    new_input_sets : list of sets
+        The resulting new list of indices if this proposed contraction is performed.
+
+    """
+
+    # Find the contraction
+    contract = _find_contraction(positions, input_sets, output_set)
+    idx_result, new_input_sets, idx_removed, idx_contract = contract
+
+    # Sieve the results based on memory_limit
+    new_size = _compute_size_by_dict(idx_result, idx_dict)
+    if new_size > memory_limit:
+        return None
+
+    # Build sort tuple
+    old_sizes = (_compute_size_by_dict(input_sets[p], idx_dict) for p in positions)
+    removed_size = sum(old_sizes) - new_size
+
+    # NB: removed_size used to be just the size of any removed indices i.e.:
+    #     helpers.compute_size_by_dict(idx_removed, idx_dict)
+    cost = _flop_count(idx_contract, idx_removed, len(positions), idx_dict)
+    sort = (-removed_size, cost)
+
+    # Sieve based on total cost as well
+    if (path_cost + cost) > naive_cost:
+        return None
+
+    # Add contraction to possible choices
+    return [sort, positions, new_input_sets]
+
+
+def _update_other_results(results, best):
+    """Update the positions and provisional input_sets of ``results`` based on
+    performing the contraction result ``best``. Remove any involving the tensors
+    contracted.
+
+    Parameters
+    ----------
+    results : list
+        List of contraction results produced by ``_parse_possible_contraction``.
+    best : list
+        The best contraction of ``results`` i.e. the one that will be performed.
+
+    Returns
+    -------
+    mod_results : list
+        The list of modifed results, updated with outcome of ``best`` contraction.
+    """
+
+    best_con = best[1]
+    bx, by = best_con
+    mod_results = []
+
+    for cost, (x, y), con_sets in results:
+
+        # Ignore results involving tensors just contracted
+        if x in best_con or y in best_con:
+            continue
+
+        # Update the input_sets
+        del con_sets[by - int(by > x) - int(by > y)]
+        del con_sets[bx - int(bx > x) - int(bx > y)]
+        con_sets.insert(-1, best[2][-1])
+
+        # Update the position indices
+        mod_con = x - int(x > bx) - int(x > by), y - int(y > bx) - int(y > by)
+        mod_results.append((cost, mod_con, con_sets))
+
+    return mod_results
 
 def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
     """
@@ -219,46 +345,68 @@ def _greedy_path(input_sets, output_set, idx_dict, memory_limit):
     [(0, 2), (0, 1)]
     """
 
+    # Handle trivial cases that leaked through
     if len(input_sets) == 1:
         return [(0,)]
+    elif len(input_sets) == 2:
+        return [(0, 1)]
+
+    # Build up a naive cost
+    contract = _find_contraction(range(len(input_sets)), input_sets, output_set)
+    idx_result, new_input_sets, idx_removed, idx_contract = contract
+    naive_cost = _flop_count(idx_contract, idx_removed, len(input_sets), idx_dict)
 
+    # Initially iterate over all pairs
+    comb_iter = itertools.combinations(range(len(input_sets)), 2)
+    known_contractions = []
+
+    path_cost = 0
     path = []
-    for iteration in range(len(input_sets) - 1):
-        iteration_results = []
-        comb_iter = []
 
-        # Compute all unique pairs
-        for x in range(len(input_sets)):
-            for y in range(x + 1, len(input_sets)):
-                comb_iter.append((x, y))
+    for iteration in range(len(input_sets) - 1):
 
+        # Iterate over all pairs on first step, only previously found pairs on subsequent steps
         for positions in comb_iter:
 
-            # Find the contraction
-            contract = _find_contraction(positions, input_sets, output_set)
-            idx_result, new_input_sets, idx_removed, idx_contract = contract
-
-            # Sieve the results based on memory_limit
-            if _compute_size_by_dict(idx_result, idx_dict) > memory_limit:
+            # Always initially ignore outer products
+            if input_sets[positions[0]].isdisjoint(input_sets[positions[1]]):
                 continue
 
-            # Build sort tuple
-            removed_size = _compute_size_by_dict(idx_removed, idx_dict)
-            cost = _compute_size_by_dict(idx_contract, idx_dict)
-            sort = (-removed_size, cost)
+            result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost,
+                                                 naive_cost)
+            if result is not None:
+                known_contractions.append(result)
+
+        # If we do not have a inner contraction, rescan pairs including outer products
+        if len(known_contractions) == 0:
 
-            # Add contraction to possible choices
-            iteration_results.append([sort, positions, new_input_sets])
+            # Then check the outer products
+            for positions in itertools.combinations(range(len(input_sets)), 2):
+                result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit,
+                                                     path_cost, naive_cost)
+                if result is not None:
+                    known_contractions.append(result)
 
-        # If we did not find a new contraction contract remaining
-        if len(iteration_results) == 0:
-            path.append(tuple(range(len(input_sets))))
-            break
+            # If we still did not find any remaining contractions, default back to einsum like behavior
+            if len(known_contractions) == 0:
+                path.append(tuple(range(len(input_sets))))
+                break
 
         # Sort based on first index
-        best = min(iteration_results, key=lambda x: x[0])
-        path.append(best[1])
+        best = min(known_contractions, key=lambda x: x[0])
+
+        # Now propagate as many unused contractions as possible to next iteration
+        known_contractions = _update_other_results(known_contractions, best)
+
+        # Next iteration only compute contractions with the new tensor
+        # All other contractions have been accounted for
         input_sets = best[2]
+        new_tensor_pos = len(input_sets) - 1
+        comb_iter = ((i, new_tensor_pos) for i in range(new_tensor_pos))
+
+        # Update path and total cost
+        path.append(best[1])
+        path_cost += best[0][1]
 
     return path
 
@@ -314,26 +462,27 @@ def _can_dot(inputs, result, idx_removed):
     if len(inputs) != 2:
         return False
 
-    # Build a few temporaries
     input_left, input_right = inputs
+
+    for c in set(input_left + input_right):
+        # can't deal with repeated indices on same input or more than 2 total
+        nl, nr = input_left.count(c), input_right.count(c)
+        if (nl > 1) or (nr > 1) or (nl + nr > 2):
+            return False
+
+        # can't do implicit summation or dimension collapse e.g.
+        #     "ab,bc->c" (implicitly sum over 'a')
+        #     "ab,ca->ca" (take diagonal of 'a')
+        if nl + nr - 1 == int(c in result):
+            return False
+
+    # Build a few temporaries
     set_left = set(input_left)
     set_right = set(input_right)
     keep_left = set_left - idx_removed
     keep_right = set_right - idx_removed
     rs = len(idx_removed)
 
-    # Indices must overlap between the two operands
-    if not len(set_left & set_right):
-        return False
-
-    # We cannot have duplicate indices ("ijj, jk -> ik")
-    if (len(set_left) != len(input_left)) or (len(set_right) != len(input_right)):
-        return False
-
-    # Cannot handle partial inner ("ij, ji -> i")
-    if len(keep_left & keep_right):
-        return False
-
     # At this point we are a DOT, GEMV, or GEMM operation
 
     # Handle inner products
@@ -371,6 +520,7 @@ def _can_dot(inputs, result, idx_removed):
     # We are a matrix-matrix product, but we need to copy data
     return True
 
+
 def _parse_einsum_input(operands):
     """
     A reproduction of einsum c side einsum parsing in python.
@@ -697,6 +847,7 @@ def einsum_path(*operands, **kwargs):
 
     # Get length of each unique dimension and ensure all dimensions are correct
     dimension_dict = {}
+    broadcast_indices = [[] for x in range(len(input_list))]
     for tnum, term in enumerate(input_list):
         sh = operands[tnum].shape
         if len(sh) != len(term):
@@ -705,6 +856,11 @@ def einsum_path(*operands, **kwargs):
                              % (input_subscripts[tnum], tnum))
         for cnum, char in enumerate(term):
             dim = sh[cnum]
+
+            # Build out broadcast indices
+            if dim == 1:
+                broadcast_indices[tnum].append(char)
+
             if char in dimension_dict.keys():
                 # For broadcasting cases we always want the largest dim size
                 if dimension_dict[char] == 1:
@@ -716,6 +872,9 @@ def einsum_path(*operands, **kwargs):
             else:
                 dimension_dict[char] = dim
 
+    # Convert broadcast inds to sets
+    broadcast_indices = [set(x) for x in broadcast_indices]
+
     # Compute size of each input array plus the output array
     size_list = []
     for term in input_list + [output_subscript]:
@@ -729,20 +888,14 @@ def einsum_path(*operands, **kwargs):
 
     # Compute naive cost
     # This isn't quite right, need to look into exactly how einsum does this
-    naive_cost = _compute_size_by_dict(indices, dimension_dict)
-    indices_in_input = input_subscripts.replace(',', '')
-    mult = max(len(input_list) - 1, 1)
-    if (len(indices_in_input) - len(set(indices_in_input))):
-        mult *= 2
-    naive_cost *= mult
+    inner_product = (sum(len(x) for x in input_sets) - len(indices)) > 0
+    naive_cost = _flop_count(indices, inner_product, len(input_list), dimension_dict)
 
     # Compute the path
     if (path_type is False) or (len(input_list) in [1, 2]) or (indices == output_set):
         # Nothing to be optimized, leave it to einsum
         path = [tuple(range(len(input_list)))]
     elif path_type == "greedy":
-        # Maximum memory should be at most out_size for this algorithm
-        memory_arg = min(memory_arg, max_size)
         path = _greedy_path(input_sets, output_set, dimension_dict, memory_arg)
     elif path_type == "optimal":
         path = _optimal_path(input_sets, output_set, dimension_dict, memory_arg)
@@ -761,18 +914,24 @@ def einsum_path(*operands, **kwargs):
         contract = _find_contraction(contract_inds, input_sets, output_set)
         out_inds, input_sets, idx_removed, idx_contract = contract
 
-        cost = _compute_size_by_dict(idx_contract, dimension_dict)
-        if idx_removed:
-            cost *= 2
+        cost = _flop_count(idx_contract, idx_removed, len(contract_inds), dimension_dict)
         cost_list.append(cost)
         scale_list.append(len(idx_contract))
         size_list.append(_compute_size_by_dict(out_inds, dimension_dict))
 
+        bcast = set()
         tmp_inputs = []
         for x in contract_inds:
             tmp_inputs.append(input_list.pop(x))
+            bcast |= broadcast_indices.pop(x)
 
-        do_blas = _can_dot(tmp_inputs, out_inds, idx_removed)
+        new_bcast_inds = bcast - idx_removed
+
+        # If we're broadcasting, nix blas
+        if not len(idx_removed & bcast):
+            do_blas = _can_dot(tmp_inputs, out_inds, idx_removed)
+        else:
+            do_blas = False
 
         # Last contraction
         if (cnum - len(path)) == -1:
@@ -782,6 +941,7 @@ def einsum_path(*operands, **kwargs):
             idx_result = "".join([x[1] for x in sorted(sort_result)])
 
         input_list.append(idx_result)
+        broadcast_indices.append(new_bcast_inds)
         einsum_str = ",".join(tmp_inputs) + "->" + idx_result
 
         contraction = (contract_inds, idx_removed, einsum_str, input_list[:], do_blas)
@@ -828,19 +988,27 @@ def einsum(*operands, **kwargs):
 
     Evaluates the Einstein summation convention on the operands.
 
-    Using the Einstein summation convention, many common multi-dimensional
-    array operations can be represented in a simple fashion.  This function
-    provides a way to compute such summations. The best way to understand this
-    function is to try the examples below, which show how many common NumPy
-    functions can be implemented as calls to `einsum`.
+    Using the Einstein summation convention, many common multi-dimensional,
+    linear algebraic array operations can be represented in a simple fashion.
+    In *implicit* mode `einsum` computes these values.
+
+    In *explicit* mode, `einsum` provides further flexibility to compute
+    other array operations that might not be considered classical Einstein
+    summation operations, by disabling, or forcing summation over specified
+    subscript labels.
+
+    See the notes and examples for clarification.
 
     Parameters
     ----------
     subscripts : str
-        Specifies the subscripts for summation.
+        Specifies the subscripts for summation as comma separated list of
+        subscript labels. An implicit (classical Einstein summation)
+        calculation is performed unless the explicit indicator '->' is
+        included as well as subscript labels of the precise output form.
     operands : list of array_like
         These are the arrays for the operation.
-    out : {ndarray, None}, optional
+    out : ndarray, optional
         If provided, the calculation is done into this array.
     dtype : {data-type, None}, optional
         If provided, forces the calculation to use the data type specified.
@@ -869,7 +1037,7 @@ def einsum(*operands, **kwargs):
         Controls if intermediate optimization should occur. No optimization
         will occur if False and True will default to the 'greedy' algorithm.
         Also accepts an explicit contraction list from the ``np.einsum_path``
-        function. See ``np.einsum_path`` for more details. Default is True.
+        function. See ``np.einsum_path`` for more details. Defaults to False.
 
     Returns
     -------
@@ -884,50 +1052,80 @@ def einsum(*operands, **kwargs):
     -----
     .. versionadded:: 1.6.0
 
-    The subscripts string is a comma-separated list of subscript labels,
-    where each label refers to a dimension of the corresponding operand.
-    Repeated subscripts labels in one operand take the diagonal.  For example,
-    ``np.einsum('ii', a)`` is equivalent to ``np.trace(a)``.
+    The Einstein summation convention can be used to compute
+    many multi-dimensional, linear algebraic array operations. `einsum`
+    provides a succinct way of representing these.
 
-    Whenever a label is repeated, it is summed, so ``np.einsum('i,i', a, b)``
-    is equivalent to ``np.inner(a,b)``.  If a label appears only once,
-    it is not summed, so ``np.einsum('i', a)`` produces a view of ``a``
-    with no changes.
+    A non-exhaustive list of these operations,
+    which can be computed by `einsum`, is shown below along with examples:
 
-    The order of labels in the output is by default alphabetical.  This
-    means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while
-    ``np.einsum('ji', a)`` takes its transpose.
+    * Trace of an array, :py:func:`numpy.trace`.
+    * Return a diagonal, :py:func:`numpy.diag`.
+    * Array axis summations, :py:func:`numpy.sum`.
+    * Transpositions and permutations, :py:func:`numpy.transpose`.
+    * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`.
+    * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`.
+    * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`.
+    * Tensor contractions, :py:func:`numpy.tensordot`.
+    * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`.
 
-    The output can be controlled by specifying output subscript labels
-    as well.  This specifies the label order, and allows summing to
-    be disallowed or forced when desired.  The call ``np.einsum('i->', a)``
-    is like ``np.sum(a, axis=-1)``, and ``np.einsum('ii->i', a)``
-    is like ``np.diag(a)``.  The difference is that `einsum` does not
-    allow broadcasting by default.
+    The subscripts string is a comma-separated list of subscript labels,
+    where each label refers to a dimension of the corresponding operand.
+    Whenever a label is repeated it is summed, so ``np.einsum('i,i', a, b)``
+    is equivalent to :py:func:`np.inner(a,b) <numpy.inner>`. If a label
+    appears only once, it is not summed, so ``np.einsum('i', a)`` produces a
+    view of ``a`` with no changes. A further example ``np.einsum('ij,jk', a, b)``
+    describes traditional matrix multiplication and is equivalent to
+    :py:func:`np.matmul(a,b) <numpy.matmul>`. Repeated subscript labels in one
+    operand take the diagonal. For example, ``np.einsum('ii', a)`` is equivalent
+    to :py:func:`np.trace(a) <numpy.trace>`.
+
+    In *implicit mode*, the chosen subscripts are important
+    since the axes of the output are reordered alphabetically.  This
+    means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while
+    ``np.einsum('ji', a)`` takes its transpose. Additionally,
+    ``np.einsum('ij,jk', a, b)`` returns a matrix multiplication, while,
+    ``np.einsum('ij,jh', a, b)`` returns the transpose of the
+    multiplication since subscript 'h' precedes subscript 'i'.
+
+    In *explicit mode* the output can be directly controlled by
+    specifying output subscript labels.  This requires the
+    identifier '->' as well as the list of output subscript labels.
+    This feature increases the flexibility of the function since
+    summing can be disabled or forced when required. The call
+    ``np.einsum('i->', a)`` is like :py:func:`np.sum(a, axis=-1) <numpy.sum>`,
+    and ``np.einsum('ii->i', a)`` is like :py:func:`np.diag(a) <numpy.diag>`.
+    The difference is that `einsum` does not allow broadcasting by default.
+    Additionally ``np.einsum('ij,jh->ih', a, b)`` directly specifies the
+    order of the output subscript labels and therefore returns matrix
+    multiplication, unlike the example above in implicit mode.
 
     To enable and control broadcasting, use an ellipsis.  Default
     NumPy-style broadcasting is done by adding an ellipsis
     to the left of each term, like ``np.einsum('...ii->...i', a)``.
     To take the trace along the first and last axes,
     you can do ``np.einsum('i...i', a)``, or to do a matrix-matrix
-    product with the left-most indices instead of rightmost, you can do
+    product with the left-most indices instead of rightmost, one can do
     ``np.einsum('ij...,jk...->ik...', a, b)``.
 
     When there is only one operand, no axes are summed, and no output
     parameter is provided, a view into the operand is returned instead
     of a new array.  Thus, taking the diagonal as ``np.einsum('ii->i', a)``
-    produces a view.
+    produces a view (changed in version 1.10.0).
 
-    An alternative way to provide the subscripts and operands is as
-    ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. The examples
-    below have corresponding `einsum` calls with the two parameter methods.
+    `einsum` also provides an alternative way to provide the subscripts
+    and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``.
+    If the output shape is not provided in this format `einsum` will be
+    calculated in implicit mode, otherwise it will be performed explicitly.
+    The examples below have corresponding `einsum` calls with the two
+    parameter methods.
 
     .. versionadded:: 1.10.0
 
     Views returned from einsum are now writeable whenever the input array
     is writeable. For example, ``np.einsum('ijk...->kji...', a)`` will now
-    have the same effect as ``np.swapaxes(a, 0, 2)`` and
-    ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal
+    have the same effect as :py:func:`np.swapaxes(a, 0, 2) <numpy.swapaxes>`
+    and ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal
     of a 2D array.
 
     .. versionadded:: 1.12.0
@@ -937,7 +1135,14 @@ def einsum(*operands, **kwargs):
     can greatly increase the computational efficiency at the cost of a larger
     memory footprint during computation.
 
-    See ``np.einsum_path`` for more details.
+    Typically a 'greedy' algorithm is applied which empirical tests have shown
+    returns the optimal path in the majority of cases. In some cases 'optimal'
+    will return the superlative path through a more expensive, exhaustive search.
+    For iterative calculations it may be advisable to calculate the optimal path
+    once and reuse that path by supplying it as an argument. An example is given
+    below.
+
+    See :py:func:`numpy.einsum_path` for more details.
 
     Examples
     --------
@@ -945,6 +1150,8 @@ def einsum(*operands, **kwargs):
     >>> b = np.arange(5)
     >>> c = np.arange(6).reshape(2,3)
 
+    Trace of a matrix:
+
     >>> np.einsum('ii', a)
     60
     >>> np.einsum(a, [0,0])
@@ -952,6 +1159,8 @@ def einsum(*operands, **kwargs):
     >>> np.trace(a)
     60
 
+    Extract the diagonal (requires explicit form):
+
     >>> np.einsum('ii->i', a)
     array([ 0,  6, 12, 18, 24])
     >>> np.einsum(a, [0,0], [0])
@@ -959,32 +1168,67 @@ def einsum(*operands, **kwargs):
     >>> np.diag(a)
     array([ 0,  6, 12, 18, 24])
 
-    >>> np.einsum('ij,j', a, b)
-    array([ 30,  80, 130, 180, 230])
-    >>> np.einsum(a, [0,1], b, [1])
-    array([ 30,  80, 130, 180, 230])
-    >>> np.dot(a, b)
-    array([ 30,  80, 130, 180, 230])
-    >>> np.einsum('...j,j', a, b)
-    array([ 30,  80, 130, 180, 230])
+    Sum over an axis (requires explicit form):
+
+    >>> np.einsum('ij->i', a)
+    array([ 10,  35,  60,  85, 110])
+    >>> np.einsum(a, [0,1], [0])
+    array([ 10,  35,  60,  85, 110])
+    >>> np.sum(a, axis=1)
+    array([ 10,  35,  60,  85, 110])
+
+    For higher dimensional arrays summing a single axis can be done with ellipsis:
+
+    >>> np.einsum('...j->...', a)
+    array([ 10,  35,  60,  85, 110])
+    >>> np.einsum(a, [Ellipsis,1], [Ellipsis])
+    array([ 10,  35,  60,  85, 110])
+
+    Compute a matrix transpose, or reorder any number of axes:
 
     >>> np.einsum('ji', c)
     array([[0, 3],
            [1, 4],
            [2, 5]])
+    >>> np.einsum('ij->ji', c)
+    array([[0, 3],
+           [1, 4],
+           [2, 5]])
     >>> np.einsum(c, [1,0])
     array([[0, 3],
            [1, 4],
            [2, 5]])
-    >>> c.T
+    >>> np.transpose(c)
     array([[0, 3],
            [1, 4],
            [2, 5]])
 
+    Vector inner products:
+
+    >>> np.einsum('i,i', b, b)
+    30
+    >>> np.einsum(b, [0], b, [0])
+    30
+    >>> np.inner(b,b)
+    30
+
+    Matrix vector multiplication:
+
+    >>> np.einsum('ij,j', a, b)
+    array([ 30,  80, 130, 180, 230])
+    >>> np.einsum(a, [0,1], b, [1])
+    array([ 30,  80, 130, 180, 230])
+    >>> np.dot(a, b)
+    array([ 30,  80, 130, 180, 230])
+    >>> np.einsum('...j,j', a, b)
+    array([ 30,  80, 130, 180, 230])
+
+    Broadcasting and scalar multiplication:
+
     >>> np.einsum('..., ...', 3, c)
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
-    >>> np.einsum(',ij', 3, C)
+    >>> np.einsum(',ij', 3, c)
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
     >>> np.einsum(3, [Ellipsis], c, [Ellipsis])
@@ -994,12 +1238,7 @@ def einsum(*operands, **kwargs):
     array([[ 0,  3,  6],
            [ 9, 12, 15]])
 
-    >>> np.einsum('i,i', b, b)
-    30
-    >>> np.einsum(b, [0], b, [0])
-    30
-    >>> np.inner(b,b)
-    30
+    Vector outer product:
 
     >>> np.einsum('i,j', np.arange(2)+1, b)
     array([[0, 1, 2, 3, 4],
@@ -1011,12 +1250,7 @@ def einsum(*operands, **kwargs):
     array([[0, 1, 2, 3, 4],
            [0, 2, 4, 6, 8]])
 
-    >>> np.einsum('i...->...', a)
-    array([50, 55, 60, 65, 70])
-    >>> np.einsum(a, [0,Ellipsis], [Ellipsis])
-    array([50, 55, 60, 65, 70])
-    >>> np.sum(a, axis=0)
-    array([50, 55, 60, 65, 70])
+    Tensor contraction:
 
     >>> a = np.arange(60.).reshape(3,4,5)
     >>> b = np.arange(24.).reshape(4,3,2)
@@ -1039,6 +1273,17 @@ def einsum(*operands, **kwargs):
            [ 4796.,  5162.],
            [ 4928.,  5306.]])
 
+    Writeable returned arrays (since version 1.10.0):
+
+    >>> a = np.zeros((3, 3))
+    >>> np.einsum('ii->i', a)[:] = 1
+    >>> a
+    array([[ 1.,  0.,  0.],
+           [ 0.,  1.,  0.],
+           [ 0.,  0.,  1.]])
+
+    Example of ellipsis use:
+
     >>> a = np.arange(6).reshape((3,2))
     >>> b = np.arange(12).reshape((4,3))
     >>> np.einsum('ki,jk->ij', a, b)
@@ -1051,13 +1296,26 @@ def einsum(*operands, **kwargs):
     array([[10, 28, 46, 64],
            [13, 40, 67, 94]])
 
-    >>> # since version 1.10.0
-    >>> a = np.zeros((3, 3))
-    >>> np.einsum('ii->i', a)[:] = 1
-    >>> a
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    Chained array operations. For more complicated contractions, speed ups
+    might be achieved by repeatedly computing a 'greedy' path or pre-computing the
+    'optimal' path and repeatedly applying it, using an
+    `einsum_path` insertion (since version 1.12.0). Performance improvements can be
+    particularly significant with larger arrays:
+
+    >>> a = np.ones(64).reshape(2,4,8)
+    # Basic `einsum`: ~1520ms  (benchmarked on 3.1GHz Intel i5.)
+    >>> for iteration in range(500):
+    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a)
+    # Sub-optimal `einsum` (due to repeated path calculation time): ~330ms
+    >>> for iteration in range(500):
+    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')
+    # Greedy `einsum` (faster optimal path approximation): ~160ms
+    >>> for iteration in range(500):
+    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy')
+    # Optimal `einsum` (best usage pattern in some use cases): ~110ms
+    >>> path = np.einsum_path('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')[0]
+    >>> for iteration in range(500):
+    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path)
 
     """
 
@@ -1101,25 +1359,14 @@ def einsum(*operands, **kwargs):
             tmp_operands.append(operands.pop(x))
 
         # Do we need to deal with the output?
-        if specified_out and ((num + 1) == len(contraction_list)):
-            handle_out = True
+        handle_out = specified_out and ((num + 1) == len(contraction_list))
 
-        # Handle broadcasting vs BLAS cases
+        # Call tensordot if still possible
         if blas:
             # Checks have already been handled
             input_str, results_index = einsum_str.split('->')
             input_left, input_right = input_str.split(',')
-            if 1 in tmp_operands[0].shape or 1 in tmp_operands[1].shape:
-                left_dims = {dim: size for dim, size in
-                             zip(input_left, tmp_operands[0].shape)}
-                right_dims = {dim: size for dim, size in
-                              zip(input_right, tmp_operands[1].shape)}
-                # If dims do not match we are broadcasting, BLAS off
-                if any(left_dims[ind] != right_dims[ind] for ind in idx_rm):
-                    blas = False
 
-        # Call tensordot if still possible
-        if blas:
             tensor_result = input_left + input_right
             for s in idx_rm:
                 tensor_result = tensor_result.replace(s, "")
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 5b67a0dc5..b9cc98cae 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -1198,6 +1198,16 @@ def resize(a, new_shape):
     --------
     ndarray.resize : resize an array in-place.
 
+    Notes
+    -----
+    Warning: This functionality does **not** consider axes separately,
+    i.e. it does not apply interpolation/extrapolation.
+    It fills the return array with the required number of elements, taken
+    from `a` as they are laid out in memory, disregarding strides and axes.
+    (This is in case the new shape is smaller. For larger, see above.)
+    This functionality is therefore not suitable to resize images,
+    or data where each axis represents a separate and distinct entity.
+
     Examples
     --------
     >>> a=np.array([[0,1],[2,3]])
@@ -1615,16 +1625,16 @@ def nonzero(a):
 
     Examples
     --------
-    >>> x = np.array([[1,0,0], [0,2,0], [1,1,0]])
+    >>> x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]])
     >>> x
-    array([[1, 0, 0],
-           [0, 2, 0],
-           [1, 1, 0]])
+    array([[3, 0, 0],
+           [0, 4, 0],
+           [5, 6, 0]])
     >>> np.nonzero(x)
     (array([0, 1, 2, 2]), array([0, 1, 0, 1]))
 
     >>> x[np.nonzero(x)]
-    array([1, 2, 1, 1])
+    array([3, 4, 5, 6])
     >>> np.transpose(np.nonzero(x))
     array([[0, 0],
            [1, 1],
@@ -1636,7 +1646,7 @@ def nonzero(a):
     boolean array and since False is interpreted as 0, np.nonzero(a > 3)
     yields the indices of the `a` where the condition is true.
 
-    >>> a = np.array([[1,2,3],[4,5,6],[7,8,9]])
+    >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     >>> a > 3
     array([[False, False, False],
            [ True,  True,  True],
@@ -1644,7 +1654,14 @@ def nonzero(a):
     >>> np.nonzero(a > 3)
     (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2]))
 
-    The ``nonzero`` method of the boolean array can also be called.
+    Using this result to index `a` is equivalent to using the mask directly:
+
+    >>> a[np.nonzero(a > 3)]
+    array([4, 5, 6, 7, 8, 9])
+    >>> a[a > 3]  # prefer this spelling
+    array([4, 5, 6, 7, 8, 9])
+
+    ``nonzero`` can also be called as a method of the array.
 
     >>> (a > 3).nonzero()
     (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2]))
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 82de1a36e..fb72bada5 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -6,6 +6,7 @@ import operator
 from . import numeric as _nx
 from .numeric import (result_type, NaN, shares_memory, MAY_SHARE_BOUNDS,
                       TooHardError,asanyarray)
+from numpy.core.multiarray import add_docstring
 
 __all__ = ['logspace', 'linspace', 'geomspace']
 
@@ -356,3 +357,38 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None):
                                  endpoint=endpoint, base=10.0, dtype=dtype)
 
     return result.astype(dtype)
+
+
+#always succeed
+def add_newdoc(place, obj, doc):
+    """
+    Adds documentation to obj which is in module place.
+
+    If doc is a string add it to obj as a docstring
+
+    If doc is a tuple, then the first element is interpreted as
+       an attribute of obj and the second as the docstring
+          (method, docstring)
+
+    If doc is a list, then each element of the list should be a
+       sequence of length two --> [(method1, docstring1),
+       (method2, docstring2), ...]
+
+    This routine never raises an error.
+
+    This routine cannot modify read-only docstrings, as appear
+    in new-style classes or built-in functions. Because this
+    routine never raises an error the caller must check manually
+    that the docstrings were changed.
+    """
+    try:
+        new = getattr(__import__(place, globals(), {}, [obj]), obj)
+        if isinstance(doc, str):
+            add_docstring(new, doc.strip())
+        elif isinstance(doc, tuple):
+            add_docstring(getattr(new, doc[0]), doc[1].strip())
+        elif isinstance(doc, list):
+            for val in doc:
+                add_docstring(getattr(new, val[0]), val[1].strip())
+    except Exception:
+        pass
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 106ffa450..c712fd3ef 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -39,17 +39,19 @@
      * _M_AMD64 defined by MS compiler
      */
     #define NPY_CPU_AMD64
+#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+    #define NPY_CPU_PPC64LE
+#elif defined(__powerpc64__) && defined(__BIG_ENDIAN__)
+    #define NPY_CPU_PPC64
 #elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC)
     /*
      * __ppc__ is defined by gcc, I remember having seen __powerpc__ once,
      * but can't find it ATM
      * _ARCH_PPC is used by at least gcc on AIX
+     * As __powerpc__ and _ARCH_PPC are also defined by PPC64 check
+     * for those specifically first before defaulting to ppc
      */
     #define NPY_CPU_PPC
-#elif defined(__ppc64le__)
-    #define NPY_CPU_PPC64LE
-#elif defined(__ppc64__)
-    #define NPY_CPU_PPC64
 #elif defined(__sparc__) || defined(__sparc)
     /* __sparc__ is defined by gcc and Forte (e.g. Sun) compilers */
     #define NPY_CPU_SPARC
@@ -61,10 +63,27 @@
     #define NPY_CPU_HPPA
 #elif defined(__alpha__)
     #define NPY_CPU_ALPHA
-#elif defined(__arm__) && defined(__ARMEL__)
-    #define NPY_CPU_ARMEL
-#elif defined(__arm__) && defined(__ARMEB__)
-    #define NPY_CPU_ARMEB
+#elif defined(__arm__)
+    #if defined(__ARMEB__)
+        #if defined(__ARM_32BIT_STATE)
+            #define NPY_CPU_ARMEB_AARCH32
+        #elif defined(__ARM_64BIT_STATE)
+            #define NPY_CPU_ARMEB_AARCH64
+        #else
+            #define NPY_CPU_ARMEB
+        #endif
+    #elif defined(__ARMEL__)
+        #if defined(__ARM_32BIT_STATE)
+            #define NPY_CPU_ARMEL_AARCH32
+        #elif defined(__ARM_64BIT_STATE)
+            #define NPY_CPU_ARMEL_AARCH64
+        #else
+            #define NPY_CPU_ARMEL
+        #endif
+    #else
+        # error Unknown ARM CPU, please report this to numpy maintainers with \
+	information about your platform (OS, CPU and compiler)
+    #endif
 #elif defined(__sh__) && defined(__LITTLE_ENDIAN__)
     #define NPY_CPU_SH_LE
 #elif defined(__sh__) && defined(__BIG_ENDIAN__)
@@ -75,8 +94,6 @@
     #define NPY_CPU_MIPSEB
 #elif defined(__or1k__)
     #define NPY_CPU_OR1K
-#elif defined(__aarch64__)
-    #define NPY_CPU_AARCH64
 #elif defined(__mc68000__)
     #define NPY_CPU_M68K
 #elif defined(__arc__) && defined(__LITTLE_ENDIAN__)
diff --git a/numpy/core/include/numpy/npy_endian.h b/numpy/core/include/numpy/npy_endian.h
index 649bdb0a6..44cdffd14 100644
--- a/numpy/core/include/numpy/npy_endian.h
+++ b/numpy/core/include/numpy/npy_endian.h
@@ -37,28 +37,31 @@
     #define NPY_LITTLE_ENDIAN 1234
     #define NPY_BIG_ENDIAN 4321
 
-    #if defined(NPY_CPU_X86)            \
-            || defined(NPY_CPU_AMD64)   \
-            || defined(NPY_CPU_IA64)    \
-            || defined(NPY_CPU_ALPHA)   \
-            || defined(NPY_CPU_ARMEL)   \
-            || defined(NPY_CPU_AARCH64) \
-            || defined(NPY_CPU_SH_LE)   \
-            || defined(NPY_CPU_MIPSEL)  \
-            || defined(NPY_CPU_PPC64LE) \
-            || defined(NPY_CPU_ARCEL)   \
+    #if defined(NPY_CPU_X86)                  \
+            || defined(NPY_CPU_AMD64)         \
+            || defined(NPY_CPU_IA64)          \
+            || defined(NPY_CPU_ALPHA)         \
+            || defined(NPY_CPU_ARMEL)         \
+            || defined(NPY_CPU_ARMEL_AARCH32) \
+            || defined(NPY_CPU_ARMEL_AARCH64) \
+            || defined(NPY_CPU_SH_LE)         \
+            || defined(NPY_CPU_MIPSEL)        \
+            || defined(NPY_CPU_PPC64LE)       \
+            || defined(NPY_CPU_ARCEL)         \
             || defined(NPY_CPU_RISCV64)
         #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN
-    #elif defined(NPY_CPU_PPC)          \
-            || defined(NPY_CPU_SPARC)   \
-            || defined(NPY_CPU_S390)    \
-            || defined(NPY_CPU_HPPA)    \
-            || defined(NPY_CPU_PPC64)   \
-            || defined(NPY_CPU_ARMEB)   \
-            || defined(NPY_CPU_SH_BE)   \
-            || defined(NPY_CPU_MIPSEB)  \
-            || defined(NPY_CPU_OR1K)    \
-            || defined(NPY_CPU_M68K)    \
+    #elif defined(NPY_CPU_PPC)                \
+            || defined(NPY_CPU_SPARC)         \
+            || defined(NPY_CPU_S390)          \
+            || defined(NPY_CPU_HPPA)          \
+            || defined(NPY_CPU_PPC64)         \
+            || defined(NPY_CPU_ARMEB)         \
+            || defined(NPY_CPU_ARMEB_AARCH32) \
+            || defined(NPY_CPU_ARMEB_AARCH64) \
+            || defined(NPY_CPU_SH_BE)         \
+            || defined(NPY_CPU_MIPSEB)        \
+            || defined(NPY_CPU_OR1K)          \
+            || defined(NPY_CPU_M68K)          \
             || defined(NPY_CPU_ARCEB)
         #define NPY_BYTE_ORDER NPY_BIG_ENDIAN
     #else
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 106f0ccfe..e5570791a 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1509,11 +1509,14 @@ def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False):
     --------
     normalize_axis_index : normalizing a single scalar axis
     """
-    try:
-        axis = [operator.index(axis)]
-    except TypeError:
-        axis = tuple(axis)
-    axis = tuple(normalize_axis_index(ax, ndim, argname) for ax in axis)
+    # Optimization to speed-up the most common cases.
+    if type(axis) not in (tuple, list):
+        try:
+            axis = [operator.index(axis)]
+        except TypeError:
+            pass
+    # Going via an iterator directly is slower than via list comprehension.
+    axis = tuple([normalize_axis_index(ax, ndim, argname) for ax in axis])
     if not allow_duplicate and len(set(axis)) != len(axis):
         if argname:
             raise ValueError('repeated axis in `{}` argument'.format(argname))
@@ -1897,7 +1900,7 @@ def fromfunction(function, shape, **kwargs):
         The result of the call to `function` is passed back directly.
         Therefore the shape of `fromfunction` is completely determined by
         `function`.  If `function` returns a scalar value, the shape of
-        `fromfunction` would match the `shape` parameter.
+        `fromfunction` would not match the `shape` parameter.
 
     See Also
     --------
diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c
index c941bb29b..6460c5db1 100644
--- a/numpy/core/src/multiarray/cblasfuncs.c
+++ b/numpy/core/src/multiarray/cblasfuncs.c
@@ -12,32 +12,6 @@
 #include "npy_cblas.h"
 #include "arraytypes.h"
 #include "common.h"
-#include "mem_overlap.h"
-
-
-/*
- * Helper: call appropriate BLAS dot function for typenum.
- * Strides are NumPy strides.
- */
-static void
-blas_dot(int typenum, npy_intp n,
-         void *a, npy_intp stridea, void *b, npy_intp strideb, void *res)
-{
-    switch (typenum) {
-        case NPY_DOUBLE:
-            DOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
-            break;
-        case NPY_FLOAT:
-            FLOAT_dot(a, stridea, b, strideb, res, n, NULL);
-            break;
-        case NPY_CDOUBLE:
-            CDOUBLE_dot(a, stridea, b, strideb, res, n, NULL);
-            break;
-        case NPY_CFLOAT:
-            CFLOAT_dot(a, stridea, b, strideb, res, n, NULL);
-            break;
-    }
-}
 
 
 static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
@@ -227,6 +201,7 @@ _bad_strides(PyArrayObject *ap)
     return 0;
 }
 
+
 /*
  * dot(a,b)
  * Returns the dot product of a and b for arrays of floating point types.
@@ -379,77 +354,9 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
         }
     }
 
-    if (out != NULL) {
-        int d;
-
-        /* verify that out is usable */
-        if (PyArray_NDIM(out) != nd ||
-            PyArray_TYPE(out) != typenum ||
-            !PyArray_ISCARRAY(out)) {
-
-            PyErr_SetString(PyExc_ValueError,
-                "output array is not acceptable (must have the right datatype, "
-                "number of dimensions, and be a C-Array)");
-            goto fail;
-        }
-        for (d = 0; d < nd; ++d) {
-            if (dimensions[d] != PyArray_DIM(out, d)) {
-                PyErr_SetString(PyExc_ValueError,
-                    "output array has wrong dimensions");
-                goto fail;
-            }
-        }
-
-        /* check for memory overlap */
-        if (!(solve_may_share_memory(out, ap1, 1) == 0 &&
-              solve_may_share_memory(out, ap2, 1) == 0)) {
-            /* allocate temporary output array */
-            out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER,
-                                                            NULL, 0);
-            if (out_buf == NULL) {
-                goto fail;
-            }
-
-            /* set copy-back */
-            Py_INCREF(out);
-            if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) {
-                Py_DECREF(out);
-                goto fail;
-            }
-        }
-        else {
-            Py_INCREF(out);
-            out_buf = out;
-        }
-        Py_INCREF(out);
-        result = out;
-    }
-    else {
-        double prior1, prior2;
-        PyTypeObject *subtype;
-        PyObject *tmp;
-
-        /* Choose which subtype to return */
-        if (Py_TYPE(ap1) != Py_TYPE(ap2)) {
-            prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
-            prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
-            subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
-        }
-        else {
-            prior1 = prior2 = 0.0;
-            subtype = Py_TYPE(ap1);
-        }
-
-        tmp = (PyObject *)(prior2 > prior1 ? ap2 : ap1);
-
-        out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                               typenum, NULL, NULL, 0, 0, tmp);
-        if (out_buf == NULL) {
-            goto fail;
-        }
-
-        Py_INCREF(out_buf);
-        result = out_buf;
+    out_buf = new_array_for_sum(ap1, ap2, out, nd, dimensions, typenum, &result);
+    if (out_buf == NULL) {
+        goto fail;
     }
 
     numbytes = PyArray_NBYTES(out_buf);
@@ -617,10 +524,10 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
         NPY_BEGIN_ALLOW_THREADS;
 
         /* Dot product between two vectors -- Level 1 BLAS */
-        blas_dot(typenum, l,
+        PyArray_DESCR(out_buf)->f->dotfunc(
                  PyArray_DATA(ap1), PyArray_STRIDE(ap1, (ap1shape == _row)),
                  PyArray_DATA(ap2), PyArray_STRIDE(ap2, 0),
-                 PyArray_DATA(out_buf));
+                 PyArray_DATA(out_buf), l, NULL);
         NPY_END_ALLOW_THREADS;
     }
     else if (ap1shape == _matrix && ap2shape != _matrix) {
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index c70f8526e..4f695fdc7 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -15,6 +15,7 @@
 #include "buffer.h"
 
 #include "get_attr_string.h"
+#include "mem_overlap.h"
 
 /*
  * The casting to use for implicit assignment operations resulting from
@@ -852,3 +853,102 @@ _may_have_objects(PyArray_Descr *dtype)
     return (PyDataType_HASFIELDS(base) ||
             PyDataType_FLAGCHK(base, NPY_ITEM_HASOBJECT) );
 }
+
+/*
+ * Make a new empty array, of the passed size, of a type that takes the
+ * priority of ap1 and ap2 into account.
+ *
+ * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an
+ * updateifcopy temporary array may be returned. If `result` is non-NULL, the
+ * output array to be returned (`out` if non-NULL and the newly allocated array
+ * otherwise) is incref'd and put to *result.
+ */
+NPY_NO_EXPORT PyArrayObject *
+new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
+                  int nd, npy_intp dimensions[], int typenum, PyArrayObject **result)
+{
+    PyArrayObject *out_buf;
+
+    if (out) {
+        int d;
+
+        /* verify that out is usable */
+        if (PyArray_NDIM(out) != nd ||
+            PyArray_TYPE(out) != typenum ||
+            !PyArray_ISCARRAY(out)) {
+            PyErr_SetString(PyExc_ValueError,
+                "output array is not acceptable (must have the right datatype, "
+                "number of dimensions, and be a C-Array)");
+            return 0;
+        }
+        for (d = 0; d < nd; ++d) {
+            if (dimensions[d] != PyArray_DIM(out, d)) {
+                PyErr_SetString(PyExc_ValueError,
+                    "output array has wrong dimensions");
+                return 0;
+            }
+        }
+
+        /* check for memory overlap */
+        if (!(solve_may_share_memory(out, ap1, 1) == 0 &&
+              solve_may_share_memory(out, ap2, 1) == 0)) {
+            /* allocate temporary output array */
+            out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER,
+                                                            NULL, 0);
+            if (out_buf == NULL) {
+                return NULL;
+            }
+
+            /* set copy-back */
+            Py_INCREF(out);
+            if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) {
+                Py_DECREF(out);
+                Py_DECREF(out_buf);
+                return NULL;
+            }
+        }
+        else {
+            Py_INCREF(out);
+            out_buf = out;
+        }
+
+        if (result) {
+            Py_INCREF(out);
+            *result = out;
+        }
+
+        return out_buf;
+    }
+    else {
+        PyTypeObject *subtype;
+        double prior1, prior2;
+        /*
+         * Need to choose an output array that can hold a sum
+         * -- use priority to determine which subtype.
+         */
+        if (Py_TYPE(ap2) != Py_TYPE(ap1)) {
+            prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
+            prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
+            subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
+        }
+        else {
+            prior1 = prior2 = 0.0;
+            subtype = Py_TYPE(ap1);
+        }
+
+        out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
+                                               typenum, NULL, NULL, 0, 0,
+                                               (PyObject *)
+                                               (prior2 > prior1 ? ap2 : ap1));
+
+        if (out_buf != NULL && result) {
+            Py_INCREF(out_buf);
+            *result = out_buf;
+        }
+
+        return out_buf;
+    }
+}
+
+
+
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index ae9b960c8..db0a49920 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -283,4 +283,17 @@ blas_stride(npy_intp stride, unsigned itemsize)
 
 #include "ucsnarrow.h"
 
+/*
+ * Make a new empty array, of the passed size, of a type that takes the
+ * priority of ap1 and ap2 into account.
+ *
+ * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an
+ * updateifcopy temporary array may be returned. If `result` is non-NULL, the
+ * output array to be returned (`out` if non-NULL and the newly allocated array
+ * otherwise) is incref'd and put to *result.
+ */
+NPY_NO_EXPORT PyArrayObject *
+new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
+                  int nd, npy_intp dimensions[], int typenum, PyArrayObject **result);
+
 #endif
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index bcb44f6d1..1c27f8394 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -21,11 +21,17 @@
  * and 0 if the array is not monotonic.
  */
 static int
-check_array_monotonic(const double *a, npy_int lena)
+check_array_monotonic(const double *a, npy_intp lena)
 {
     npy_intp i;
     double next;
-    double last = a[0];
+    double last;
+
+    if (lena == 0) {
+        /* all bin edges hold the same value */
+        return 1;
+    }
+    last = a[0];
 
     /* Skip repeated values at the beginning of the array */
     for (i = 1; (i < lena) && (a[i] == last); i++);
@@ -209,106 +215,41 @@ fail:
     return NULL;
 }
 
-/*
- * digitize(x, bins, right=False) returns an array of integers the same length
- * as x. The values i returned are such that bins[i - 1] <= x < bins[i] if
- * bins is monotonically increasing, or bins[i - 1] > x >= bins[i] if bins
- * is monotonically decreasing.  Beyond the bounds of bins, returns either
- * i = 0 or i = len(bins) as appropriate. If right == True the comparison
- * is bins [i - 1] < x <= bins[i] or bins [i - 1] >= x > bins[i]
- */
+/* Internal function to expose check_array_monotonic to python */
 NPY_NO_EXPORT PyObject *
-arr_digitize(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
+    static char *kwlist[] = {"x", NULL};
     PyObject *obj_x = NULL;
-    PyObject *obj_bins = NULL;
     PyArrayObject *arr_x = NULL;
-    PyArrayObject *arr_bins = NULL;
-    PyObject *ret = NULL;
-    npy_intp len_bins;
-    int monotonic, right = 0;
-    NPY_BEGIN_THREADS_DEF
-
-    static char *kwlist[] = {"x", "bins", "right", NULL};
+    long monotonic;
+    npy_intp len_x;
+    NPY_BEGIN_THREADS_DEF;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|i:digitize", kwlist,
-                                     &obj_x, &obj_bins, &right)) {
-        goto fail;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|_monotonicity", kwlist,
+                                     &obj_x)) {
+        return NULL;
     }
 
-    /* PyArray_SearchSorted will make `x` contiguous even if we don't */
-    arr_x = (PyArrayObject *)PyArray_FROMANY(obj_x, NPY_DOUBLE, 0, 0,
-                                             NPY_ARRAY_CARRAY_RO);
+    /*
+     * TODO:
+     *  `x` could be strided, needs change to check_array_monotonic
+     *  `x` is forced to double for this check
+     */
+    arr_x = (PyArrayObject *)PyArray_FROMANY(
+        obj_x, NPY_DOUBLE, 1, 1, NPY_ARRAY_CARRAY_RO);
     if (arr_x == NULL) {
-        goto fail;
-    }
-
-    /* TODO: `bins` could be strided, needs change to check_array_monotonic */
-    arr_bins = (PyArrayObject *)PyArray_FROMANY(obj_bins, NPY_DOUBLE, 1, 1,
-                                               NPY_ARRAY_CARRAY_RO);
-    if (arr_bins == NULL) {
-        goto fail;
-    }
-
-    len_bins = PyArray_SIZE(arr_bins);
-    if (len_bins == 0) {
-        PyErr_SetString(PyExc_ValueError, "bins must have non-zero length");
-        goto fail;
+        return NULL;
     }
 
-    NPY_BEGIN_THREADS_THRESHOLDED(len_bins)
-    monotonic = check_array_monotonic((const double *)PyArray_DATA(arr_bins),
-                                      len_bins);
+    len_x = PyArray_SIZE(arr_x);
+    NPY_BEGIN_THREADS_THRESHOLDED(len_x)
+    monotonic = check_array_monotonic(
+        (const double *)PyArray_DATA(arr_x), len_x);
     NPY_END_THREADS
+    Py_DECREF(arr_x);
 
-    if (monotonic == 0) {
-        PyErr_SetString(PyExc_ValueError,
-                        "bins must be monotonically increasing or decreasing");
-        goto fail;
-    }
-
-    /* PyArray_SearchSorted needs an increasing array */
-    if (monotonic == - 1) {
-        PyArrayObject *arr_tmp = NULL;
-        npy_intp shape = PyArray_DIM(arr_bins, 0);
-        npy_intp stride = -PyArray_STRIDE(arr_bins, 0);
-        void *data = (void *)(PyArray_BYTES(arr_bins) - stride * (shape - 1));
-
-        arr_tmp = (PyArrayObject *)PyArray_NewFromDescrAndBase(
-                &PyArray_Type, PyArray_DescrFromType(NPY_DOUBLE),
-                1, &shape, &stride, data,
-                PyArray_FLAGS(arr_bins), NULL, (PyObject *)arr_bins);
-        Py_DECREF(arr_bins);
-        if (!arr_tmp) {
-            goto fail;
-        }
-        arr_bins = arr_tmp;
-    }
-
-    ret = PyArray_SearchSorted(arr_bins, (PyObject *)arr_x,
-                               right ? NPY_SEARCHLEFT : NPY_SEARCHRIGHT, NULL);
-    if (!ret) {
-        goto fail;
-    }
-
-    /* If bins is decreasing, ret has bins from end, not start */
-    if (monotonic == -1) {
-        npy_intp *ret_data =
-                        (npy_intp *)PyArray_DATA((PyArrayObject *)ret);
-        npy_intp len_ret = PyArray_SIZE((PyArrayObject *)ret);
-
-        NPY_BEGIN_THREADS_THRESHOLDED(len_ret)
-        while (len_ret--) {
-            *ret_data = len_bins - *ret_data;
-            ret_data++;
-        }
-        NPY_END_THREADS
-    }
-
-    fail:
-        Py_XDECREF(arr_x);
-        Py_XDECREF(arr_bins);
-        return ret;
+    return PyInt_FromLong(monotonic);
 }
 
 /*
@@ -654,6 +595,10 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
             else if (j == lenxp - 1) {
                 dres[i] = dy[j];
             }
+            else if (dx[j] == x_val) {
+                /* Avoid potential non-finite interpolation */
+                dres[i] = dy[j];
+            }
             else {
                 const npy_double slope = (slopes != NULL) ? slopes[j] :
                                          (dy[j+1] - dy[j]) / (dx[j+1] - dx[j]);
@@ -822,6 +767,10 @@ arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
             else if (j == lenxp - 1) {
                 dres[i] = dy[j];
             }
+            else if (dx[j] == x_val) {
+                /* Avoid potential non-finite interpolation */
+                dres[i] = dy[j];
+            }
             else {
                 if (slopes!=NULL) {
                     dres[i].real = slopes[j].real*(x_val - dx[j]) + dy[j].real;
diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h
index 51508531c..082139910 100644
--- a/numpy/core/src/multiarray/compiled_base.h
+++ b/numpy/core/src/multiarray/compiled_base.h
@@ -7,7 +7,7 @@ arr_insert(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
 arr_bincount(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
-arr_digitize(PyObject *, PyObject *, PyObject *kwds);
+arr__monotonicity(PyObject *, PyObject *, PyObject *kwds);
 NPY_NO_EXPORT PyObject *
 arr_interp(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 33184d99a..1765982a0 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -2767,11 +2767,11 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         goto fail;
     }
 
-    /* Initialize the output to all zeros and reset the iterator */
+    /* Initialize the output to all zeros */
     ret = NpyIter_GetOperandArray(iter)[nop];
-    Py_INCREF(ret);
-    PyArray_AssignZero(ret, NULL);
-
+    if (PyArray_AssignZero(ret, NULL) < 0) {
+        goto fail;
+    }
 
     /***************************/
     /*
@@ -2785,16 +2785,12 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
             case 1:
                 if (ndim == 2) {
                     if (unbuffered_loop_nop1_ndim2(iter) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
                         goto fail;
                     }
                     goto finish;
                 }
                 else if (ndim == 3) {
                     if (unbuffered_loop_nop1_ndim3(iter) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
                         goto fail;
                     }
                     goto finish;
@@ -2803,16 +2799,12 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
             case 2:
                 if (ndim == 2) {
                     if (unbuffered_loop_nop2_ndim2(iter) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
                         goto fail;
                     }
                     goto finish;
                 }
                 else if (ndim == 3) {
                     if (unbuffered_loop_nop2_ndim3(iter) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
                         goto fail;
                     }
                     goto finish;
@@ -2823,7 +2815,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     /***************************/
 
     if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
-        Py_DECREF(ret);
         goto fail;
     }
 
@@ -2845,8 +2836,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     if (sop == NULL) {
         PyErr_SetString(PyExc_TypeError,
                     "invalid data type for einsum");
-        Py_DECREF(ret);
-        ret = NULL;
     }
     else if (NpyIter_GetIterSize(iter) != 0) {
         NpyIter_IterNextFunc *iternext;
@@ -2858,7 +2847,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         iternext = NpyIter_GetIterNext(iter, NULL);
         if (iternext == NULL) {
             NpyIter_Deallocate(iter);
-            Py_DECREF(ret);
             goto fail;
         }
         dataptr = NpyIter_GetDataPtrArray(iter);
@@ -2874,12 +2862,16 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
 
         /* If the API was needed, it may have thrown an error */
         if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) {
-            Py_DECREF(ret);
-            ret = NULL;
+            goto fail;
         }
     }
 
 finish:
+    if (out != NULL) {
+        ret = out;
+    }
+    Py_INCREF(ret);
+
     NpyIter_Deallocate(iter);
     for (iop = 0; iop < nop; ++iop) {
         Py_DECREF(op[iop]);
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 46ff78b9c..f338226c2 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1540,13 +1540,11 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view,
                                 "cannot use field titles in multi-field index");
                 }
                 if (titlecmp != 0 || PyDict_SetItem(fields, title, tup) < 0) {
-                    Py_DECREF(title);
                     Py_DECREF(name);
                     Py_DECREF(fields);
                     Py_DECREF(names);
                     return 0;
                 }
-                Py_DECREF(title);
             }
             /* disallow duplicate field indices */
             if (PyDict_Contains(fields, name)) {
@@ -2084,7 +2082,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
                                                PyArray_TRIVIALLY_ITERABLE_OP_READ,
                                                PyArray_TRIVIALLY_ITERABLE_OP_READ) ||
                  (PyArray_NDIM(tmp_arr) == 0 &&
-                        PyArray_TRIVIALLY_ITERABLE(tmp_arr))) &&
+                        PyArray_TRIVIALLY_ITERABLE(ind))) &&
                 /* Check if the type is equivalent to INTP */
                 PyArray_ITEMSIZE(ind) == sizeof(npy_intp) &&
                 PyArray_DESCR(ind)->kind == 'i' &&
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index d6f2577a3..2e836d1d0 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -976,9 +976,12 @@ array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
     PyObject *ufunc, *method_name, *normal_args, *ufunc_method;
     PyObject *result = NULL;
-    int num_override_args;
+    int has_override;
 
-    if (PyTuple_Size(args) < 2) {
+    assert(PyTuple_CheckExact(args));
+    assert(kwds == NULL || PyDict_CheckExact(kwds));
+
+    if (PyTuple_GET_SIZE(args) < 2) {
         PyErr_SetString(PyExc_TypeError,
                         "__array_ufunc__ requires at least 2 arguments");
         return NULL;
@@ -988,11 +991,11 @@ array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds)
         return NULL;
     }
     /* ndarray cannot handle overrides itself */
-    num_override_args = PyUFunc_WithOverride(normal_args, kwds, NULL, NULL);
-    if (num_override_args == -1) {
-        return NULL;
+    has_override = PyUFunc_HasOverride(normal_args, kwds);
+    if (has_override < 0) {
+        goto cleanup;
     }
-    if (num_override_args) {
+    else if (has_override) {
         result = Py_NotImplemented;
         Py_INCREF(Py_NotImplemented);
         goto cleanup;
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index f78a748c0..6e57f1d6d 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -800,102 +800,6 @@ PyArray_CanCoerceScalar(int thistype, int neededtype,
     return 0;
 }
 
-/*
- * Make a new empty array, of the passed size, of a type that takes the
- * priority of ap1 and ap2 into account.
- *
- * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an
- * updateifcopy temporary array may be returned. If `result` is non-NULL, the
- * output array to be returned (`out` if non-NULL and the newly allocated array
- * otherwise) is incref'd and put to *result.
- */
-static PyArrayObject *
-new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
-                  int nd, npy_intp dimensions[], int typenum, PyArrayObject **result)
-{
-    PyArrayObject *out_buf;
-
-    if (out) {
-        int d;
-
-        /* verify that out is usable */
-        if (PyArray_NDIM(out) != nd ||
-            PyArray_TYPE(out) != typenum ||
-            !PyArray_ISCARRAY(out)) {
-            PyErr_SetString(PyExc_ValueError,
-                "output array is not acceptable (must have the right datatype, "
-                "number of dimensions, and be a C-Array)");
-            return 0;
-        }
-        for (d = 0; d < nd; ++d) {
-            if (dimensions[d] != PyArray_DIM(out, d)) {
-                PyErr_SetString(PyExc_ValueError,
-                    "output array has wrong dimensions");
-                return 0;
-            }
-        }
-
-        /* check for memory overlap */
-        if (!(solve_may_share_memory(out, ap1, 1) == 0 &&
-              solve_may_share_memory(out, ap2, 1) == 0)) {
-            /* allocate temporary output array */
-            out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER,
-                                                            NULL, 0);
-            if (out_buf == NULL) {
-                return NULL;
-            }
-
-            /* set copy-back */
-            Py_INCREF(out);
-            if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) {
-                Py_DECREF(out);
-                Py_DECREF(out_buf);
-                return NULL;
-            }
-        }
-        else {
-            Py_INCREF(out);
-            out_buf = out;
-        }
-
-        if (result) {
-            Py_INCREF(out);
-            *result = out;
-        }
-
-        return out_buf;
-    }
-    else {
-        PyTypeObject *subtype;
-        double prior1, prior2;
-        /*
-         * Need to choose an output array that can hold a sum
-         * -- use priority to determine which subtype.
-         */
-        if (Py_TYPE(ap2) != Py_TYPE(ap1)) {
-            prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
-            prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
-            subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1));
-        }
-        else {
-            prior1 = prior2 = 0.0;
-            subtype = Py_TYPE(ap1);
-        }
-
-        out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
-                                               typenum, NULL, NULL, 0, 0,
-                                               (PyObject *)
-                                               (prior2 > prior1 ? ap2 : ap1));
-
-        if (out_buf != NULL && result) {
-            Py_INCREF(out_buf);
-            *result = out_buf;
-        }
-
-        return out_buf;
-    }
-}
-
 /* Could perhaps be redone to not make contiguous arrays */
 
 /*NUMPY_API
@@ -1101,7 +1005,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(ap2));
     while (it1->index < it1->size) {
         while (it2->index < it2->size) {
-            dot(it1->dataptr, is1, it2->dataptr, is2, op, l, out_buf);
+            dot(it1->dataptr, is1, it2->dataptr, is2, op, l, NULL);
             op += os;
             PyArray_ITER_NEXT(it2);
         }
@@ -4441,7 +4345,7 @@ static struct PyMethodDef array_module_methods[] = {
         "indicated by mask."},
     {"bincount", (PyCFunction)arr_bincount,
         METH_VARARGS | METH_KEYWORDS, NULL},
-    {"digitize", (PyCFunction)arr_digitize,
+    {"_monotonicity", (PyCFunction)arr__monotonicity,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"interp", (PyCFunction)arr_interp,
         METH_VARARGS | METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h
index 094612b7d..f9c671f77 100644
--- a/numpy/core/src/private/lowlevel_strided_loops.h
+++ b/numpy/core/src/private/lowlevel_strided_loops.h
@@ -689,21 +689,16 @@ npy_bswap8_unaligned(char * x)
 #define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0
 #define PyArray_TRIVIALLY_ITERABLE_OP_READ 1
 
-#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) (            \
-                        PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
-                        PyArray_CompareLists(PyArray_DIMS(arr1), \
-                                             PyArray_DIMS(arr2), \
-                                             PyArray_NDIM(arr1)) && \
-                        (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \
-                                      NPY_ARRAY_F_CONTIGUOUS)) & \
-                                (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \
-                                              NPY_ARRAY_F_CONTIGUOUS)) \
-                        )
+#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
+                    PyArray_NDIM(arr) <= 1 || \
+                    PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \
+                    PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \
+                    )
 
 #define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \
-                        size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
-                                          PyArray_STRIDE(arr, 0) : \
-                                          PyArray_ITEMSIZE(arr)))
+        assert(PyArray_TRIVIALLY_ITERABLE(arr)), \
+        size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
+                             PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr)))
 
 static NPY_INLINE int
 PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
@@ -757,15 +752,22 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr
     return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead);
 }
 
+#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) (            \
+                        PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
+                        PyArray_CompareLists(PyArray_DIMS(arr1), \
+                                             PyArray_DIMS(arr2), \
+                                             PyArray_NDIM(arr1)) && \
+                        (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \
+                                      NPY_ARRAY_F_CONTIGUOUS)) & \
+                                (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \
+                                              NPY_ARRAY_F_CONTIGUOUS)) \
+                        )
+
 #define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \
                         PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
                         PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \
                             arr1, arr2, arr1_read, arr2_read))
-#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
-                    PyArray_NDIM(arr) <= 1 || \
-                    PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \
-                    PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \
-                    )
+
 #define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \
                     count = PyArray_SIZE(arr); \
                     data = PyArray_BYTES(arr); \
@@ -774,7 +776,6 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr
                                             PyArray_STRIDE(arr, 0) : \
                                             PyArray_ITEMSIZE(arr)));
 
-
 #define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) (   \
                     PyArray_TRIVIALLY_ITERABLE(arr1) && \
                         (PyArray_NDIM(arr2) == 0 || \
diff --git a/numpy/core/src/private/npy_config.h b/numpy/core/src/private/npy_config.h
index 107b3cb5b..8143e7719 100644
--- a/numpy/core/src/private/npy_config.h
+++ b/numpy/core/src/private/npy_config.h
@@ -15,7 +15,8 @@
  * amd64 is not harmed much by the bloat as the system provides 16 byte
  * alignment by default.
  */
-#if (defined NPY_CPU_X86 || defined _WIN32)
+#if (defined NPY_CPU_X86 || defined _WIN32 || defined NPY_CPU_ARMEL_AARCH32 ||\
+     defined NPY_CPU_ARMEB_AARCH32)
 #define NPY_MAX_COPY_ALIGNMENT 8
 #else
 #define NPY_MAX_COPY_ALIGNMENT 16
diff --git a/numpy/core/src/private/ufunc_override.c b/numpy/core/src/private/ufunc_override.c
index 116da3267..69c3cc56c 100644
--- a/numpy/core/src/private/ufunc_override.c
+++ b/numpy/core/src/private/ufunc_override.c
@@ -54,11 +54,71 @@ get_non_default_array_ufunc(PyObject *obj)
 }
 
 /*
- * Check whether a set of input and output args have a non-default
- *  `__array_ufunc__` method. Return the number of overrides, setting
- * corresponding objects in PyObject array with_override and the corresponding
- * __array_ufunc__ methods in methods (both only if not NULL, and both using
- * new references).
+ * Check whether an object has __array_ufunc__ defined on its class and it
+ * is not the default, i.e., the object is not an ndarray, and its
+ * __array_ufunc__ is not the same as that of ndarray.
+ *
+ * Returns 1 if this is the case, 0 if not.
+ */
+
+static int
+has_non_default_array_ufunc(PyObject * obj)
+{
+    PyObject *method = get_non_default_array_ufunc(obj);
+    if (method) {
+        Py_DECREF(method);
+        return 1;
+    }
+    else {
+        return 0;
+    }
+}
+
+/*
+ * Get possible out argument from kwds, and returns the number of outputs
+ * contained within it: if a tuple, the number of elements in it, 1 otherwise.
+ * The out argument itself is returned in out_kwd_obj, and the outputs
+ * in the out_obj array (all as borrowed references).
+ *
+ * Returns -1 if kwds is not a dict, 0 if no outputs found.
+ */
+static int
+get_out_objects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs)
+{
+    if (kwds == NULL) {
+        return 0;
+    }
+    if (!PyDict_CheckExact(kwds)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "Internal Numpy error: call to PyUFunc_WithOverride "
+                        "with non-dict kwds");
+        return -1;
+    }
+    /* borrowed reference */
+    *out_kwd_obj = PyDict_GetItemString(kwds, "out");
+    if (*out_kwd_obj == NULL) {
+        return 0;
+    }
+    if (PyTuple_CheckExact(*out_kwd_obj)) {
+        *out_objs = PySequence_Fast_ITEMS(*out_kwd_obj);
+        return PySequence_Fast_GET_SIZE(*out_kwd_obj);
+    }
+    else {
+        *out_objs = out_kwd_obj;
+        return 1;
+    }
+}
+
+/*
+ * For each positional argument and each argument in a possible "out"
+ * keyword, look for overrides of the standard ufunc behaviour, i.e.,
+ * non-default __array_ufunc__ methods.
+ *
+ * Returns the number of overrides, setting corresponding objects
+ * in PyObject array ``with_override`` and the corresponding
+ * __array_ufunc__ methods in ``methods`` (both using new references).
+ *
+ * Only the first override for a given class is returned.
  *
  * returns -1 on failure.
  */
@@ -67,64 +127,52 @@ PyUFunc_WithOverride(PyObject *args, PyObject *kwds,
                      PyObject **with_override, PyObject **methods)
 {
     int i;
-
-    int nargs;
-    int nout_kwd = 0;
-    int out_kwd_is_tuple = 0;
     int num_override_args = 0;
+    int narg, nout = 0;
+    PyObject *out_kwd_obj;
+    PyObject **arg_objs, **out_objs;
 
-    PyObject *obj;
-    PyObject *out_kwd_obj = NULL;
-    /*
-     * Check inputs
-     */
-    if (!PyTuple_Check(args)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "Internal Numpy error: call to PyUFunc_HasOverride "
-                        "with non-tuple");
-        goto fail;
+    narg = PyTuple_Size(args);
+    if (narg < 0) {
+        return -1;
     }
-    nargs = PyTuple_GET_SIZE(args);
-    if (nargs > NPY_MAXARGS) {
-        PyErr_SetString(PyExc_TypeError,
-                        "Internal Numpy error: too many arguments in call "
-                        "to PyUFunc_HasOverride");
-        goto fail;
-    }
-    /* be sure to include possible 'out' keyword argument. */
-    if (kwds && PyDict_CheckExact(kwds)) {
-        out_kwd_obj = PyDict_GetItemString(kwds, "out");
-        if (out_kwd_obj != NULL) {
-            out_kwd_is_tuple = PyTuple_CheckExact(out_kwd_obj);
-            if (out_kwd_is_tuple) {
-                nout_kwd = PyTuple_GET_SIZE(out_kwd_obj);
-            }
-            else {
-                nout_kwd = 1;
-            }
-        }
+    arg_objs = PySequence_Fast_ITEMS(args);
+
+    nout = get_out_objects(kwds, &out_kwd_obj, &out_objs);
+    if (nout < 0) {
+        return -1;
     }
 
-    for (i = 0; i < nargs + nout_kwd; ++i) {
-        PyObject *method;
-        if (i < nargs) {
-            obj = PyTuple_GET_ITEM(args, i);
+    for (i = 0; i < narg + nout; ++i) {
+        PyObject *obj;
+        int j;
+        int new_class = 1;
+
+        if (i < narg) {
+            obj = arg_objs[i];
         }
         else {
-            if (out_kwd_is_tuple) {
-                obj = PyTuple_GET_ITEM(out_kwd_obj, i - nargs);
-            }
-            else {
-                obj = out_kwd_obj;
-            }
+            obj = out_objs[i - narg];
         }
         /*
-         * Now see if the object provides an __array_ufunc__. However, we should
-         * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as
-         * any ndarray subclass instances that did not override __array_ufunc__.
+         * Have we seen this class before?  If so, ignore.
          */
-        method = get_non_default_array_ufunc(obj);
-        if (method != NULL) {
+        for (j = 0; j < num_override_args; j++) {
+            new_class = (Py_TYPE(obj) != Py_TYPE(with_override[j]));
+            if (!new_class) {
+                break;
+            }
+        }
+        if (new_class) {
+            /*
+             * Now see if the object provides an __array_ufunc__. However, we should
+             * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as
+             * any ndarray subclass instances that did not override __array_ufunc__.
+             */
+            PyObject *method = get_non_default_array_ufunc(obj);
+            if (method == NULL) {
+                continue;
+            }
             if (method == Py_None) {
                 PyErr_Format(PyExc_TypeError,
                              "operand '%.200s' does not support ufuncs "
@@ -133,31 +181,61 @@ PyUFunc_WithOverride(PyObject *args, PyObject *kwds,
                 Py_DECREF(method);
                 goto fail;
             }
-            if (with_override != NULL) {
-                Py_INCREF(obj);
-                with_override[num_override_args] = obj;
-            }
-            if (methods != NULL) {
-                methods[num_override_args] = method;
-            }
-            else {
-                Py_DECREF(method);
-            }
+            Py_INCREF(obj);
+            with_override[num_override_args] = obj;
+            methods[num_override_args] = method;
             ++num_override_args;
         }
     }
     return num_override_args;
 
 fail:
-    if (methods != NULL) {
-        for (i = 0; i < num_override_args; i++) {
-            Py_DECREF(methods[i]);
+    for (i = 0; i < num_override_args; i++) {
+        Py_DECREF(with_override[i]);
+        Py_DECREF(methods[i]);
+    }
+    return -1;
+}
+
+/*
+ * Check whether any of a set of input and output args have a non-default
+ * __array_ufunc__ method. Return 1 if so, 0 if not.
+ *
+ * This function primarily exists to help ndarray.__array_ufunc__ determine
+ * whether it can support a ufunc (which is the case only if none of the
+ * operands have an override).  Thus, unlike in PyUFunc_CheckOverride, the
+ * actual overrides are not needed and one can stop looking once one is found.
+ *
+ * TODO: move this function and has_non_default_array_ufunc closer to ndarray.
+ */
+NPY_NO_EXPORT int
+PyUFunc_HasOverride(PyObject *args, PyObject *kwds)
+{
+    int i;
+    int nin, nout;
+    PyObject *out_kwd_obj;
+    PyObject **in_objs, **out_objs;
+
+    /* check inputs */
+    nin = PyTuple_Size(args);
+    if (nin < 0) {
+        return -1;
+    }
+    in_objs = PySequence_Fast_ITEMS(args);
+    for (i = 0; i < nin; ++i) {
+        if (has_non_default_array_ufunc(in_objs[i])) {
+            return 1;
         }
     }
-    if (with_override != NULL) {
-        for (i = 0; i < num_override_args; i++) {
-            Py_DECREF(with_override[i]);
+    /* check outputs, if any */
+    nout = get_out_objects(kwds, &out_kwd_obj, &out_objs);
+    if (nout < 0) {
+        return -1;
+    }
+    for (i = 0; i < nout; i++) {
+        if (has_non_default_array_ufunc(out_objs[i])) {
+            return 1;
         }
     }
-    return -1;
+    return 0;
 }
diff --git a/numpy/core/src/private/ufunc_override.h b/numpy/core/src/private/ufunc_override.h
index 2ed1c626f..fd1ee2135 100644
--- a/numpy/core/src/private/ufunc_override.h
+++ b/numpy/core/src/private/ufunc_override.h
@@ -12,4 +12,7 @@
 NPY_NO_EXPORT int
 PyUFunc_WithOverride(PyObject *args, PyObject *kwds,
                      PyObject **with_override, PyObject **methods);
+
+NPY_NO_EXPORT int
+PyUFunc_HasOverride(PyObject *args, PyObject *kwds);
 #endif
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index b964c568e..a3fd72839 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -552,6 +552,181 @@ ufunc_get_name_cstr(PyUFuncObject *ufunc) {
 }
 
 /*
+ * Helpers for keyword parsing
+ */
+
+/*
+ * Find key in a list of pointers to keyword names.
+ * The list should end with NULL.
+ *
+ * Returns either the index into the list (pointing to the final key with NULL
+ * if no match was found), or -1 on failure.
+ */
+static npy_intp
+locate_key(PyObject **kwnames, PyObject *key)
+{
+    PyObject **kwname = kwnames;
+    while (*kwname != NULL && *kwname != key) {
+        kwname++;
+    }
+    /* Slow fallback, just in case */
+    if (*kwname == NULL) {
+        int cmp = 0;
+        kwname = kwnames;
+        while (*kwname != NULL &&
+               (cmp = PyObject_RichCompareBool(key, *kwname,
+                                               Py_EQ)) == 0) {
+            kwname++;
+        }
+        if (cmp < 0) {
+            return -1;
+        }
+    }
+    return kwname - kwnames;
+}
+
+/*
+ * Parse keyword arguments, matching against kwnames
+ *
+ * Arguments beyond kwnames (the va_list) should contain converters and outputs
+ * for each keyword name (where an output can be NULL to indicate the particular
+ * keyword should be ignored).
+ *
+ * Returns 0 on success, -1 on failure with an error set.
+ *
+ * Note that the parser does not clean up on failure, i.e., already parsed keyword
+ * values may hold new references, which the caller has to remove.
+ *
+ * TODO: ufunc is only used for the name in error messages; passing on the
+ *       name instead might be an option.
+ *
+ * TODO: instead of having this function ignore of keywords for which the
+ *       corresponding output is NULL, the calling routine should prepare the
+ *       correct list.
+ */
+static int
+parse_ufunc_keywords(PyUFuncObject *ufunc, PyObject *kwds, PyObject **kwnames, ...)
+{
+    va_list va;
+    PyObject *key, *value;
+    Py_ssize_t pos = 0;
+    typedef int converter(PyObject *, void *);
+
+    while (PyDict_Next(kwds, &pos, &key, &value)) {
+        int i;
+        converter *convert;
+        void *output = NULL;
+        npy_intp index = locate_key(kwnames, key);
+        if (index < 0) {
+            return -1;
+        }
+        if (kwnames[index]) {
+            va_start(va, kwnames);
+            for (i = 0; i <= index; i++) {
+                convert = va_arg(va, converter *);
+                output = va_arg(va, void *);
+            }
+            va_end(va);
+        }
+        if (output) {
+            if (!convert(value, output)) {
+                return -1;
+            }
+        }
+        else {
+#if PY_VERSION_HEX >= 0x03000000
+            PyErr_Format(PyExc_TypeError,
+                         "'%S' is an invalid keyword to ufunc '%s'",
+                         key, ufunc_get_name_cstr(ufunc));
+#else
+            char *str = PyString_AsString(key);
+            if (str == NULL) {
+                PyErr_Clear();
+                PyErr_SetString(PyExc_TypeError, "invalid keyword argument");
+            }
+            else {
+                PyErr_Format(PyExc_TypeError,
+                             "'%s' is an invalid keyword to ufunc '%s'",
+                             str, ufunc_get_name_cstr(ufunc));
+            }
+#endif
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Converters for use in parsing of keywords arguments.
+ */
+NPY_NO_EXPORT int
+_subok_converter(PyObject *obj, int *subok)
+{
+    if (PyBool_Check(obj)) {
+        *subok = (obj == Py_True);
+        return NPY_SUCCEED;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                        "'subok' must be a boolean");
+        return NPY_FAIL;
+    }
+}
+
+NPY_NO_EXPORT int
+_keepdims_converter(PyObject *obj, int *keepdims)
+{
+    if (PyBool_Check(obj)) {
+        *keepdims = (obj == Py_True);
+        return NPY_SUCCEED;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                        "'keepdims' must be a boolean");
+        return NPY_FAIL;
+    }
+}
+
+NPY_NO_EXPORT int
+_wheremask_converter(PyObject *obj, PyArrayObject **wheremask)
+{
+    /*
+     * Optimization: where=True is the same as no where argument.
+     * This lets us document True as the default.
+     */
+    if (obj == Py_True) {
+        return NPY_SUCCEED;
+    }
+    else {
+        PyArray_Descr *dtype = PyArray_DescrFromType(NPY_BOOL);
+        if (dtype == NULL) {
+            return NPY_FAIL;
+        }
+        /* PyArray_FromAny steals reference to dtype, even on failure */
+        *wheremask = (PyArrayObject *)PyArray_FromAny(obj, dtype, 0, 0, 0, NULL);
+        if ((*wheremask) == NULL) {
+            return NPY_FAIL;
+        }
+        return NPY_SUCCEED;
+    }
+}
+
+NPY_NO_EXPORT int
+_new_reference(PyObject *obj, PyObject **out)
+{
+    Py_INCREF(obj);
+    *out = obj;
+    return NPY_SUCCEED;
+}
+
+NPY_NO_EXPORT int
+_borrowed_reference(PyObject *obj, PyObject **out)
+{
+    *out = obj;
+    return NPY_SUCCEED;
+}
+
+/*
  * Parses the positional and keyword arguments for a generic ufunc call.
  * All returned arguments are new references (with optional ones NULL
  * if not present)
@@ -575,12 +750,9 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
     int nout = ufunc->nout;
     int nop = ufunc->nargs;
     PyObject *obj, *context;
-    PyObject *str_key_obj = NULL;
-    const char *ufunc_name = ufunc_get_name_cstr(ufunc);
-    int has_sig = 0;
-
+    PyArray_Descr *dtype = NULL;
     /*
-     * Initialize objects so caller knows when outputs and other optional
+     * Initialize output objects so caller knows when outputs and optional
      * arguments are set (also means we can safely XDECREF on failure).
      */
     for (i = 0; i < nop; i++) {
@@ -646,253 +818,149 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
     }
 
     /*
-     * Get keyword output and other arguments.
-     * Raise an error if anything else is present in the
-     * keyword dictionary.
+     * If keywords are present, get keyword output and other arguments.
+     * Raise an error if anything else is present in the keyword dictionary.
      */
-    if (kwds != NULL) {
-        PyObject *key, *value;
-        Py_ssize_t pos = 0;
-        while (PyDict_Next(kwds, &pos, &key, &value)) {
-            Py_ssize_t length = 0;
-            char *str = NULL;
-            int bad_arg = 1;
-
-#if defined(NPY_PY3K)
-            Py_XDECREF(str_key_obj);
-            str_key_obj = PyUnicode_AsASCIIString(key);
-            if (str_key_obj != NULL) {
-                key = str_key_obj;
-            }
-#endif
-
-            if (PyBytes_AsStringAndSize(key, &str, &length) < 0) {
-                PyErr_Clear();
-                PyErr_SetString(PyExc_TypeError, "invalid keyword argument");
+    if (kwds) {
+        PyObject *out_kwd = NULL;
+        PyObject *sig = NULL;
+        static PyObject *kwnames[13] = {NULL};
+        if (kwnames[0] == NULL) {
+            kwnames[0] = npy_um_str_out;
+            kwnames[1] = npy_um_str_where;
+            kwnames[2] = npy_um_str_axes;
+            kwnames[3] = npy_um_str_axis;
+            kwnames[4] = npy_um_str_keepdims;
+            kwnames[5] = npy_um_str_casting;
+            kwnames[6] = npy_um_str_order;
+            kwnames[7] = npy_um_str_dtype;
+            kwnames[8] = npy_um_str_subok;
+            kwnames[9] = npy_um_str_signature;
+            kwnames[10] = npy_um_str_sig;
+            kwnames[11] = npy_um_str_extobj;
+            kwnames[12] = NULL;  /* sentinel */
+        }
+        /*
+         * Parse using converters to calculate outputs
+         * (NULL outputs are treated as indicating a keyword is not allowed).
+         */
+        if (parse_ufunc_keywords(
+                ufunc, kwds, kwnames,
+                _borrowed_reference, &out_kwd,
+                _wheremask_converter, out_wheremask,  /* new reference */
+                _new_reference, out_axes,
+                _new_reference, out_axis,
+                _keepdims_converter, out_keepdims,
+                PyArray_CastingConverter, out_casting,
+                PyArray_OrderConverter, out_order,
+                PyArray_DescrConverter2, &dtype,   /* new reference */
+                _subok_converter, out_subok,
+                _new_reference, out_typetup,
+                _borrowed_reference, &sig,
+                _new_reference, out_extobj) < 0) {
+            goto fail;
+        }
+        /*
+         * Check that outputs were not passed as positional as well,
+         * and that they are either None or an array.
+         */
+        if (out_kwd) {  /* borrowed reference */
+            /*
+             * Output arrays are generally specified as a tuple of arrays
+             * and None, but may be a single array or None for ufuncs
+             * with a single output.
+             */
+            if (nargs > nin) {
+                PyErr_SetString(PyExc_ValueError,
+                                "cannot specify 'out' as both a "
+                                "positional and keyword argument");
                 goto fail;
             }
-
-            switch (str[0]) {
-                case 'a':
-                    /* possible axes argument for generalized ufunc */
-                    if (out_axes != NULL && strcmp(str, "axes") == 0) {
-                        if (out_axis != NULL && *out_axis != NULL) {
-                            PyErr_SetString(PyExc_TypeError,
-                                "cannot specify both 'axis' and 'axes'");
-                            goto fail;
-                        }
-                        Py_INCREF(value);
-                        *out_axes = value;
-                        bad_arg = 0;
-                    }
-                    else if (out_axis != NULL && strcmp(str, "axis") == 0) {
-                        if (out_axes != NULL && *out_axes != NULL) {
-                            PyErr_SetString(PyExc_TypeError,
-                                "cannot specify both 'axis' and 'axes'");
-                            goto fail;
-                        }
-                        Py_INCREF(value);
-                        *out_axis = value;
-                        bad_arg = 0;
-                    }
-                    break;
-                case 'c':
-                    /* Provides a policy for allowed casting */
-                    if (strcmp(str, "casting") == 0) {
-                        if (!PyArray_CastingConverter(value, out_casting)) {
-                            goto fail;
-                        }
-                        bad_arg = 0;
-                    }
-                    break;
-                case 'd':
-                    /* Another way to specify 'sig' */
-                    if (strcmp(str, "dtype") == 0) {
-                        /* Allow this parameter to be None */
-                        PyArray_Descr *dtype;
-                        if (!PyArray_DescrConverter2(value, &dtype)) {
-                            goto fail;
-                        }
-                        if (dtype != NULL) {
-                            if (*out_typetup != NULL) {
-                                PyErr_SetString(PyExc_RuntimeError,
-                                    "cannot specify both 'signature' and 'dtype'");
-                                goto fail;
-                            }
-                            *out_typetup = Py_BuildValue("(N)", dtype);
-                        }
-                        bad_arg = 0;
-                    }
-                    break;
-                case 'e':
-                    /*
-                     * Overrides the global parameters buffer size,
-                     * error mask, and error object
-                     */
-                    if (strcmp(str, "extobj") == 0) {
-                        Py_INCREF(value);
-                        *out_extobj = value;
-                        bad_arg = 0;
-                    }
-                    break;
-                case 'k':
-                    if (out_keepdims != NULL && strcmp(str, "keepdims") == 0) {
-                        if (!PyBool_Check(value)) {
-                            PyErr_SetString(PyExc_TypeError,
-                                        "'keepdims' must be a boolean");
-                            goto fail;
-                        }
-                        *out_keepdims = (value == Py_True);
-                        bad_arg = 0;
+            if (PyTuple_CheckExact(out_kwd)) {
+                if (PyTuple_GET_SIZE(out_kwd) != nout) {
+                    PyErr_SetString(PyExc_ValueError,
+                                    "The 'out' tuple must have exactly "
+                                    "one entry per ufunc output");
+                    goto fail;
+                }
+                /* 'out' must be a tuple of arrays and Nones */
+                for(i = 0; i < nout; ++i) {
+                    PyObject *val = PyTuple_GET_ITEM(out_kwd, i);
+                    if (_set_out_array(val, out_op+nin+i) < 0) {
+                        goto fail;
                     }
-                    break;
-                case 'o':
-                    /*
-                     * Output arrays may be specified as a keyword argument,
-                     * either as a single array or None for single output
-                     * ufuncs, or as a tuple of arrays and Nones.
-                     */
-                    if (strcmp(str, "out") == 0) {
-                        if (nargs > nin) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "cannot specify 'out' as both a "
-                                    "positional and keyword argument");
-                            goto fail;
-                        }
-                        if (PyTuple_CheckExact(value)) {
-                            if (PyTuple_GET_SIZE(value) != nout) {
-                                PyErr_SetString(PyExc_ValueError,
-                                        "The 'out' tuple must have exactly "
-                                        "one entry per ufunc output");
-                                goto fail;
-                            }
-                            /* 'out' must be a tuple of arrays and Nones */
-                            for(i = 0; i < nout; ++i) {
-                                PyObject *val = PyTuple_GET_ITEM(value, i);
-                                if (_set_out_array(val, out_op+nin+i) < 0) {
-                                    goto fail;
-                                }
-                            }
-                        }
-                        else if (nout == 1) {
-                            /* Can be an array if it only has one output */
-                            if (_set_out_array(value, out_op + nin) < 0) {
-                                goto fail;
-                            }
-                        }
-                        else {
-                            /*
-                             * If the deprecated behavior is ever removed,
-                             * keep only the else branch of this if-else
-                             */
-                            if (PyArray_Check(value) || value == Py_None) {
-                                if (DEPRECATE("passing a single array to the "
-                                              "'out' keyword argument of a "
-                                              "ufunc with\n"
-                                              "more than one output will "
-                                              "result in an error in the "
-                                              "future") < 0) {
-                                    /* The future error message */
-                                    PyErr_SetString(PyExc_TypeError,
+                }
+            }
+            else if (nout == 1) {
+                /* Can be an array if it only has one output */
+                if (_set_out_array(out_kwd, out_op + nin) < 0) {
+                    goto fail;
+                }
+            }
+            else {
+                /*
+                 * If the deprecated behavior is ever removed,
+                 * keep only the else branch of this if-else
+                 */
+                if (PyArray_Check(out_kwd) || out_kwd == Py_None) {
+                    if (DEPRECATE("passing a single array to the "
+                                  "'out' keyword argument of a "
+                                  "ufunc with\n"
+                                  "more than one output will "
+                                  "result in an error in the "
+                                  "future") < 0) {
+                        /* The future error message */
+                        PyErr_SetString(PyExc_TypeError,
                                         "'out' must be a tuple of arrays");
-                                    goto fail;
-                                }
-                                if (_set_out_array(value, out_op+nin) < 0) {
-                                    goto fail;
-                                }
-                            }
-                            else {
-                                PyErr_SetString(PyExc_TypeError,
-                                    nout > 1 ? "'out' must be a tuple "
-                                               "of arrays" :
-                                               "'out' must be an array or a "
-                                               "tuple of a single array");
-                                goto fail;
-                            }
-                        }
-                        bad_arg = 0;
+                        goto fail;
                     }
-                    /* Allows the default output layout to be overridden */
-                    else if (strcmp(str, "order") == 0) {
-                        if (!PyArray_OrderConverter(value, out_order)) {
-                            goto fail;
-                        }
-                        bad_arg = 0;
+                    if (_set_out_array(out_kwd, out_op+nin) < 0) {
+                        goto fail;
                     }
-                    break;
-                case 's':
-                    /* Allows a specific function inner loop to be selected */
-                    if (strcmp(str, "sig") == 0 ||
-                            strcmp(str, "signature") == 0) {
-                        if (has_sig == 1) {
-                            PyErr_SetString(PyExc_ValueError,
+                }
+                else {
+                    PyErr_SetString(PyExc_TypeError,
+                                    nout > 1 ? "'out' must be a tuple "
+                                    "of arrays" :
+                                    "'out' must be an array or a "
+                                    "tuple of a single array");
+                    goto fail;
+                }
+            }
+        }
+        /*
+         * Check we did not get both axis and axes, or multiple ways
+         * to define a signature.
+         */
+        if (out_axes != NULL && out_axis != NULL &&
+            *out_axes != NULL && *out_axis != NULL) {
+            PyErr_SetString(PyExc_TypeError,
+                            "cannot specify both 'axis' and 'axes'");
+            goto fail;
+        }
+        if (sig) {  /* borrowed reference */
+            if (*out_typetup != NULL) {
+                PyErr_SetString(PyExc_ValueError,
                                 "cannot specify both 'sig' and 'signature'");
-                            goto fail;
-                        }
-                        if (*out_typetup != NULL) {
-                            PyErr_SetString(PyExc_RuntimeError,
-                                    "cannot specify both 'signature' and 'dtype'");
-                            goto fail;
-                        }
-                        Py_INCREF(value);
-                        *out_typetup = value;
-                        bad_arg = 0;
-                        has_sig = 1;
-                    }
-                    else if (strcmp(str, "subok") == 0) {
-                        if (!PyBool_Check(value)) {
-                            PyErr_SetString(PyExc_TypeError,
-                                        "'subok' must be a boolean");
-                            goto fail;
-                        }
-                        *out_subok = (value == Py_True);
-                        bad_arg = 0;
-                    }
-                    break;
-                case 'w':
-                    /*
-                     * Provides a boolean array 'where=' mask if
-                     * out_wheremask is supplied.
-                     */
-                    if (out_wheremask != NULL && strcmp(str, "where") == 0) {
-                        PyArray_Descr *dtype;
-                        dtype = PyArray_DescrFromType(NPY_BOOL);
-                        if (dtype == NULL) {
-                            goto fail;
-                        }
-                        if (value == Py_True) {
-                            /*
-                             * Optimization: where=True is the same as no
-                             * where argument. This lets us document it as a
-                             * default argument
-                             */
-                            bad_arg = 0;
-                            break;
-                        }
-                        *out_wheremask = (PyArrayObject *)PyArray_FromAny(
-                                                            value, dtype,
-                                                            0, 0, 0, NULL);
-                        if (*out_wheremask == NULL) {
-                            goto fail;
-                        }
-                        bad_arg = 0;
-                    }
-                    break;
+                goto fail;
             }
-
-            if (bad_arg) {
-                char *format = "'%s' is an invalid keyword to ufunc '%s'";
-                PyErr_Format(PyExc_TypeError, format, str, ufunc_name);
+            Py_INCREF(sig);
+            *out_typetup = sig;
+        }
+        if (dtype) {  /* new reference */
+            if (*out_typetup != NULL) {
+                PyErr_SetString(PyExc_RuntimeError,
+                                "cannot specify both 'signature' and 'dtype'");
                 goto fail;
             }
+            /* Note: "N" uses the reference */
+            *out_typetup = Py_BuildValue("(N)", dtype);
         }
     }
-    Py_XDECREF(str_key_obj);
-
     return 0;
 
 fail:
-    Py_XDECREF(str_key_obj);
+    Py_XDECREF(dtype);
     Py_XDECREF(*out_typetup);
     Py_XDECREF(*out_extobj);
     if (out_wheremask != NULL) {
diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h
index d6fd3837a..5438270f1 100644
--- a/numpy/core/src/umath/ufunc_object.h
+++ b/numpy/core/src/umath/ufunc_object.h
@@ -10,13 +10,23 @@ ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
 NPY_NO_EXPORT const char*
 ufunc_get_name_cstr(PyUFuncObject *ufunc);
 
-/* interned strings (on umath import) */
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_out;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_subok;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_prepare;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_wrap;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_finalize;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_ufunc;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_pyvals_name;
+/* strings from umathmodule.c that are interned on umath import */
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_out;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_where;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axes;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axis;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_keepdims;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_casting;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_order;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_dtype;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_subok;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_signature;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_sig;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_extobj;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_wrap;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_finalize;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_ufunc;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_pyvals_name;
 
 #endif
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 5567b9bbf..9291a5138 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -226,20 +226,40 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
  *****************************************************************************
  */
 
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_out = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_subok = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_prepare = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_wrap = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_finalize = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_ufunc = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_pyvals_name = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_out = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_where = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axes = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axis = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_keepdims = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_casting = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_order = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_dtype = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_subok = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_signature = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_sig = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_extobj = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_prepare = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_wrap = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_finalize = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_ufunc = NULL;
+NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL;
 
 /* intern some strings used in ufuncs */
 static int
 intern_strings(void)
 {
     npy_um_str_out = PyUString_InternFromString("out");
+    npy_um_str_where = PyUString_InternFromString("where");
+    npy_um_str_axes = PyUString_InternFromString("axes");
+    npy_um_str_axis = PyUString_InternFromString("axis");
+    npy_um_str_keepdims = PyUString_InternFromString("keepdims");
+    npy_um_str_casting = PyUString_InternFromString("casting");
+    npy_um_str_order = PyUString_InternFromString("order");
+    npy_um_str_dtype = PyUString_InternFromString("dtype");
     npy_um_str_subok = PyUString_InternFromString("subok");
+    npy_um_str_signature = PyUString_InternFromString("signature");
+    npy_um_str_sig = PyUString_InternFromString("sig");
+    npy_um_str_extobj = PyUString_InternFromString("extobj");
     npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__");
     npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__");
     npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__");
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index 647738831..8ce374a75 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -16,7 +16,7 @@ for size, char in zip(sizes, chars):
     global_size_dict[char] = size
 
 
-class TestEinSum(object):
+class TestEinsum(object):
     def test_einsum_errors(self):
         for do_opt in [True, False]:
             # Need enough arguments
@@ -614,7 +614,7 @@ class TestEinSum(object):
         np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False)
         assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False))
         assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False))
-        
+
     def test_einsum_broadcast(self):
         # Issue #2455 change in handling ellipsis
         # remove the 'middle broadcast' error
@@ -730,19 +730,27 @@ class TestEinSum(object):
         res = np.einsum('...ij,...jk->...ik', a, a, out=out)
         assert_equal(res, tgt)
 
-    def optimize_compare(self, string):
+    def test_out_is_res(self):
+        a = np.arange(9).reshape(3, 3)
+        res = np.einsum('...ij,...jk->...ik', a, a, out=a)
+        assert res is a
+
+    def optimize_compare(self, subscripts, operands=None):
         # Tests all paths of the optimization function against
         # conventional einsum
-        operands = [string]
-        terms = string.split('->')[0].split(',')
-        for term in terms:
-            dims = [global_size_dict[x] for x in term]
-            operands.append(np.random.rand(*dims))
-
-        noopt = np.einsum(*operands, optimize=False)
-        opt = np.einsum(*operands, optimize='greedy')
+        if operands is None:
+            args = [subscripts]
+            terms = subscripts.split('->')[0].split(',')
+            for term in terms:
+                dims = [global_size_dict[x] for x in term]
+                args.append(np.random.rand(*dims))
+        else:
+            args = [subscripts] + operands
+
+        noopt = np.einsum(*args, optimize=False)
+        opt = np.einsum(*args, optimize='greedy')
         assert_almost_equal(opt, noopt)
-        opt = np.einsum(*operands, optimize='optimal')
+        opt = np.einsum(*args, optimize='optimal')
         assert_almost_equal(opt, noopt)
 
     def test_hadamard_like_products(self):
@@ -828,8 +836,28 @@ class TestEinSum(object):
         b = np.einsum('bbcdc->d', a)
         assert_equal(b, [12])
 
+    def test_broadcasting_dot_cases(self):
+        # Ensures broadcasting cases are not mistaken for GEMM
 
-class TestEinSumPath(object):
+        a = np.random.rand(1, 5, 4)
+        b = np.random.rand(4, 6)
+        c = np.random.rand(5, 6)
+        d = np.random.rand(10)
+
+        self.optimize_compare('ijk,kl,jl', operands=[a, b, c])
+        self.optimize_compare('ijk,kl,jl,i->i', operands=[a, b, c, d])
+
+        e = np.random.rand(1, 1, 5, 4)
+        f = np.random.rand(7, 7)
+        self.optimize_compare('abjk,kl,jl', operands=[e, b, c])
+        self.optimize_compare('abjk,kl,jl,ab->ab', operands=[e, b, c, f])
+
+        # Edge case found in gh-11308
+        g = np.arange(64).reshape(2, 4, 8)
+        self.optimize_compare('obk,ijk->ioj', operands=[g, g])
+
+
+class TestEinsumPath(object):
     def build_operands(self, string, size_dict=global_size_dict):
 
         # Builds views based off initial operands
@@ -875,7 +903,7 @@ class TestEinSumPath(object):
         long_test1 = self.build_operands('acdf,jbje,gihb,hfac,gfac,gifabc,hfac')
         path, path_str = np.einsum_path(*long_test1, optimize='greedy')
         self.assert_path_equal(path, ['einsum_path',
-                                      (1, 4), (2, 4), (1, 4), (1, 3), (1, 2), (0, 1)])
+                                      (3, 6), (3, 4), (2, 4), (2, 3), (0, 2), (0, 1)])
 
         path, path_str = np.einsum_path(*long_test1, optimize='optimal')
         self.assert_path_equal(path, ['einsum_path',
@@ -884,10 +912,12 @@ class TestEinSumPath(object):
         # Long test 2
         long_test2 = self.build_operands('chd,bde,agbc,hiad,bdi,cgh,agdb')
         path, path_str = np.einsum_path(*long_test2, optimize='greedy')
+        print(path)
         self.assert_path_equal(path, ['einsum_path',
                                       (3, 4), (0, 3), (3, 4), (1, 3), (1, 2), (0, 1)])
 
         path, path_str = np.einsum_path(*long_test2, optimize='optimal')
+        print(path)
         self.assert_path_equal(path, ['einsum_path',
                                       (0, 5), (1, 4), (3, 4), (1, 3), (1, 2), (0, 1)])
 
@@ -921,7 +951,7 @@ class TestEinSumPath(object):
         # Edge test4
         edge_test4 = self.build_operands('dcc,fce,ea,dbf->ab')
         path, path_str = np.einsum_path(*edge_test4, optimize='greedy')
-        self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)])
+        self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)])
 
         path, path_str = np.einsum_path(*edge_test4, optimize='optimal')
         self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)])
@@ -944,7 +974,7 @@ class TestEinSumPath(object):
         self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)])
 
         path, path_str = np.einsum_path(*path_test, optimize=True)
-        self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)])
+        self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)])
 
         exp_path = ['einsum_path', (0, 2), (0, 2), (0, 1)]
         path, path_str = np.einsum_path(*path_test, optimize=exp_path)
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index cbcd3e994..276cd9f93 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -329,6 +329,21 @@ class TestIndexing(object):
         assert_raises(IndexError, a.__getitem__, ind)
         assert_raises(IndexError, a.__setitem__, ind, 0)
 
+    def test_trivial_fancy_not_possible(self):
+        # Test that the fast path for trivial assignment is not incorrectly
+        # used when the index is not contiguous or 1D, see also gh-11467.
+        a = np.arange(6)
+        idx = np.arange(6, dtype=np.intp).reshape(2, 1, 3)[:, :, 0]
+        assert_array_equal(a[idx], idx)
+
+        # this case must not go into the fast path, note that idx is
+        # a non-contiuguous none 1D array here.
+        a[idx] = -1
+        res = np.arange(6)
+        res[0] = -1
+        res[3] = -1
+        assert_array_equal(a, res)
+
     def test_nonbaseclass_values(self):
         class SubClass(np.ndarray):
             def __array_finalize__(self, old):
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index ba4413138..62f592524 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -2391,3 +2391,15 @@ class TestRegression(object):
                 squeezed = scvalue.squeeze(axis=axis)
                 assert_equal(squeezed, scvalue)
                 assert_equal(type(squeezed), type(scvalue))
+
+    def test_field_access_by_title(self):
+        # gh-11507
+        s = 'Some long field name'
+        if HAS_REFCOUNT:
+            base = sys.getrefcount(s)
+        t = np.dtype([((s, 'f1'), np.float64)])
+        data = np.zeros(10, t)
+        for i in range(10):
+            v = str(data[['f1']])
+            if HAS_REFCOUNT:
+                assert_(base <= sys.getrefcount(s))
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 4772913be..95107b538 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1745,18 +1745,22 @@ class TestSpecialMethods(object):
                 return "B"
 
         class C(object):
+            def __init__(self):
+                self.count = 0
+
             def __array_ufunc__(self, func, method, *inputs, **kwargs):
+                self.count += 1
                 return NotImplemented
 
         class CSub(C):
             def __array_ufunc__(self, func, method, *inputs, **kwargs):
+                self.count += 1
                 return NotImplemented
 
         a = A()
         a_sub = ASub()
         b = B()
         c = C()
-        c_sub = CSub()
 
         # Standard
         res = np.multiply(a, a_sub)
@@ -1767,11 +1771,27 @@ class TestSpecialMethods(object):
         # With 1 NotImplemented
         res = np.multiply(c, a)
         assert_equal(res, "A")
+        assert_equal(c.count, 1)
+        # Check our counter works, so we can trust tests below.
+        res = np.multiply(c, a)
+        assert_equal(c.count, 2)
 
         # Both NotImplemented.
+        c = C()
+        c_sub = CSub()
         assert_raises(TypeError, np.multiply, c, c_sub)
+        assert_equal(c.count, 1)
+        assert_equal(c_sub.count, 1)
+        c.count = c_sub.count = 0
         assert_raises(TypeError, np.multiply, c_sub, c)
+        assert_equal(c.count, 1)
+        assert_equal(c_sub.count, 1)
+        c.count = 0
+        assert_raises(TypeError, np.multiply, c, c)
+        assert_equal(c.count, 1)
+        c.count = 0
         assert_raises(TypeError, np.multiply, 2, c)
+        assert_equal(c.count, 1)
 
         # Ternary testing.
         assert_equal(three_mul_ufunc(a, 1, 2), "A")
@@ -1783,11 +1803,19 @@ class TestSpecialMethods(object):
         assert_equal(three_mul_ufunc(a, 2, b), "A")
         assert_equal(three_mul_ufunc(a, 2, a_sub), "ASub")
         assert_equal(three_mul_ufunc(a, a_sub, 3), "ASub")
+        c.count = 0
         assert_equal(three_mul_ufunc(c, a_sub, 3), "ASub")
+        assert_equal(c.count, 1)
+        c.count = 0
         assert_equal(three_mul_ufunc(1, a_sub, c), "ASub")
+        assert_equal(c.count, 0)
 
+        c.count = 0
         assert_equal(three_mul_ufunc(a, b, c), "A")
+        assert_equal(c.count, 0)
+        c_sub.count = 0
         assert_equal(three_mul_ufunc(a, b, c_sub), "A")
+        assert_equal(c_sub.count, 0)
         assert_equal(three_mul_ufunc(1, 2, b), "B")
 
         assert_raises(TypeError, three_mul_ufunc, 1, 2, c)
@@ -1806,9 +1834,25 @@ class TestSpecialMethods(object):
         assert_equal(four_mul_ufunc(a_sub, 1, 2, a), "ASub")
         assert_equal(four_mul_ufunc(a, 1, 2, a_sub), "ASub")
 
+        c = C()
+        c_sub = CSub()
         assert_raises(TypeError, four_mul_ufunc, 1, 2, 3, c)
+        assert_equal(c.count, 1)
+        c.count = 0
         assert_raises(TypeError, four_mul_ufunc, 1, 2, c_sub, c)
-        assert_raises(TypeError, four_mul_ufunc, 1, c, c_sub, c)
+        assert_equal(c_sub.count, 1)
+        assert_equal(c.count, 1)
+        c2 = C()
+        c.count = c_sub.count = 0
+        assert_raises(TypeError, four_mul_ufunc, 1, c, c_sub, c2)
+        assert_equal(c_sub.count, 1)
+        assert_equal(c.count, 1)
+        assert_equal(c2.count, 0)
+        c.count = c2.count = c_sub.count = 0
+        assert_raises(TypeError, four_mul_ufunc, c2, c, c_sub, c)
+        assert_equal(c_sub.count, 1)
+        assert_equal(c.count, 0)
+        assert_equal(c2.count, 1)
 
     def test_ufunc_override_methods(self):
 
diff --git a/numpy/distutils/__init__.py b/numpy/distutils/__init__.py
index b794bebd7..8dd326920 100644
--- a/numpy/distutils/__init__.py
+++ b/numpy/distutils/__init__.py
@@ -17,7 +17,7 @@ try:
     # Normally numpy is installed if the above import works, but an interrupted
     # in-place build could also have left a __config__.py.  In that case the
     # next import may still fail, so keep it inside the try block.
-    from numpy.testing._private.pytesttester import PytestTester
+    from numpy._pytesttester import PytestTester
     test = PytestTester(__name__)
     del PytestTester
 except ImportError:
diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py
index c926e7378..3bd8057b4 100644
--- a/numpy/distutils/fcompiler/__init__.py
+++ b/numpy/distutils/fcompiler/__init__.py
@@ -35,10 +35,11 @@ from numpy.distutils.ccompiler import CCompiler, gen_lib_options
 from numpy.distutils import log
 from numpy.distutils.misc_util import is_string, all_strings, is_sequence, \
     make_temp_file, get_shared_lib_extension
-from numpy.distutils.environment import EnvironmentConfig
 from numpy.distutils.exec_command import find_executable
 from numpy.distutils.compat import get_exception
 
+from .environment import EnvironmentConfig
+
 __metaclass__ = type
 
 class CompilerNotFound(Exception):
@@ -91,7 +92,7 @@ class FCompiler(CCompiler):
 
     # These are the environment variables and distutils keys used.
     # Each configuration description is
-    # (<hook name>, <environment variable>, <key in distutils.cfg>, <convert>)
+    # (<hook name>, <environment variable>, <key in distutils.cfg>, <convert>, <append>)
     # The hook names are handled by the self._environment_hook method.
     #  - names starting with 'self.' call methods in this class
     #  - names starting with 'exe.' return the key in the executables dict
@@ -101,43 +102,43 @@ class FCompiler(CCompiler):
 
     distutils_vars = EnvironmentConfig(
         distutils_section='config_fc',
-        noopt = (None, None, 'noopt', str2bool),
-        noarch = (None, None, 'noarch', str2bool),
-        debug = (None, None, 'debug', str2bool),
-        verbose = (None, None, 'verbose', str2bool),
+        noopt = (None, None, 'noopt', str2bool, False),
+        noarch = (None, None, 'noarch', str2bool, False),
+        debug = (None, None, 'debug', str2bool, False),
+        verbose = (None, None, 'verbose', str2bool, False),
     )
 
     command_vars = EnvironmentConfig(
         distutils_section='config_fc',
-        compiler_f77 = ('exe.compiler_f77', 'F77', 'f77exec', None),
-        compiler_f90 = ('exe.compiler_f90', 'F90', 'f90exec', None),
-        compiler_fix = ('exe.compiler_fix', 'F90', 'f90exec', None),
-        version_cmd = ('exe.version_cmd', None, None, None),
-        linker_so = ('exe.linker_so', 'LDSHARED', 'ldshared', None),
-        linker_exe = ('exe.linker_exe', 'LD', 'ld', None),
-        archiver = (None, 'AR', 'ar', None),
-        ranlib = (None, 'RANLIB', 'ranlib', None),
+        compiler_f77 = ('exe.compiler_f77', 'F77', 'f77exec', None, False),
+        compiler_f90 = ('exe.compiler_f90', 'F90', 'f90exec', None, False),
+        compiler_fix = ('exe.compiler_fix', 'F90', 'f90exec', None, False),
+        version_cmd = ('exe.version_cmd', None, None, None, False),
+        linker_so = ('exe.linker_so', 'LDSHARED', 'ldshared', None, False),
+        linker_exe = ('exe.linker_exe', 'LD', 'ld', None, False),
+        archiver = (None, 'AR', 'ar', None, False),
+        ranlib = (None, 'RANLIB', 'ranlib', None, False),
     )
 
     flag_vars = EnvironmentConfig(
         distutils_section='config_fc',
-        f77 = ('flags.f77', 'F77FLAGS', 'f77flags', flaglist),
-        f90 = ('flags.f90', 'F90FLAGS', 'f90flags', flaglist),
-        free = ('flags.free', 'FREEFLAGS', 'freeflags', flaglist),
-        fix = ('flags.fix', None, None, flaglist),
-        opt = ('flags.opt', 'FOPT', 'opt', flaglist),
-        opt_f77 = ('flags.opt_f77', None, None, flaglist),
-        opt_f90 = ('flags.opt_f90', None, None, flaglist),
-        arch = ('flags.arch', 'FARCH', 'arch', flaglist),
-        arch_f77 = ('flags.arch_f77', None, None, flaglist),
-        arch_f90 = ('flags.arch_f90', None, None, flaglist),
-        debug = ('flags.debug', 'FDEBUG', 'fdebug', flaglist),
-        debug_f77 = ('flags.debug_f77', None, None, flaglist),
-        debug_f90 = ('flags.debug_f90', None, None, flaglist),
-        flags = ('self.get_flags', 'FFLAGS', 'fflags', flaglist),
-        linker_so = ('flags.linker_so', 'LDFLAGS', 'ldflags', flaglist),
-        linker_exe = ('flags.linker_exe', 'LDFLAGS', 'ldflags', flaglist),
-        ar = ('flags.ar', 'ARFLAGS', 'arflags', flaglist),
+        f77 = ('flags.f77', 'F77FLAGS', 'f77flags', flaglist, True),
+        f90 = ('flags.f90', 'F90FLAGS', 'f90flags', flaglist, True),
+        free = ('flags.free', 'FREEFLAGS', 'freeflags', flaglist, True),
+        fix = ('flags.fix', None, None, flaglist, False),
+        opt = ('flags.opt', 'FOPT', 'opt', flaglist, True),
+        opt_f77 = ('flags.opt_f77', None, None, flaglist, False),
+        opt_f90 = ('flags.opt_f90', None, None, flaglist, False),
+        arch = ('flags.arch', 'FARCH', 'arch', flaglist, False),
+        arch_f77 = ('flags.arch_f77', None, None, flaglist, False),
+        arch_f90 = ('flags.arch_f90', None, None, flaglist, False),
+        debug = ('flags.debug', 'FDEBUG', 'fdebug', flaglist, True),
+        debug_f77 = ('flags.debug_f77', None, None, flaglist, False),
+        debug_f90 = ('flags.debug_f90', None, None, flaglist, False),
+        flags = ('self.get_flags', 'FFLAGS', 'fflags', flaglist, True),
+        linker_so = ('flags.linker_so', 'LDFLAGS', 'ldflags', flaglist, True),
+        linker_exe = ('flags.linker_exe', 'LDFLAGS', 'ldflags', flaglist, True),
+        ar = ('flags.ar', 'ARFLAGS', 'arflags', flaglist, True),
     )
 
     language_map = {'.f': 'f77',
diff --git a/numpy/distutils/environment.py b/numpy/distutils/fcompiler/environment.py
index 3798e16f5..489784580 100644
--- a/numpy/distutils/environment.py
+++ b/numpy/distutils/fcompiler/environment.py
@@ -14,7 +14,7 @@ class EnvironmentConfig(object):
 
     def dump_variable(self, name):
         conf_desc = self._conf_keys[name]
-        hook, envvar, confvar, convert = conf_desc
+        hook, envvar, confvar, convert, append = conf_desc
         if not convert:
             convert = lambda x : x
         print('%s.%s:' % (self._distutils_section, name))
@@ -49,10 +49,15 @@ class EnvironmentConfig(object):
         return var
 
     def _get_var(self, name, conf_desc):
-        hook, envvar, confvar, convert = conf_desc
+        hook, envvar, confvar, convert, append = conf_desc
         var = self._hook_handler(name, hook)
         if envvar is not None:
-            var = os.environ.get(envvar, var)
+            envvar_contents = os.environ.get(envvar)
+            if envvar_contents is not None:
+                if var and append and os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
+                    var = var + [envvar_contents]
+                else:
+                    var = envvar_contents
         if confvar is not None and self._conf:
             var = self._conf.get(confvar, (None, var))[1]
         if convert is not None:
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index 186ed949d..8305aeae5 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -2300,19 +2300,9 @@ import sys
 
 extra_dll_dir = os.path.join(os.path.dirname(__file__), '.libs')
 
-if os.path.isdir(extra_dll_dir) and sys.platform == 'win32':
-    try:
-        from ctypes import windll, c_wchar_p
-        _AddDllDirectory = windll.kernel32.AddDllDirectory
-        _AddDllDirectory.argtypes = [c_wchar_p]
-        # Needed to initialize AddDllDirectory modifications
-        windll.kernel32.SetDefaultDllDirectories(0x1000)
-    except AttributeError:
-        def _AddDllDirectory(dll_directory):
-            os.environ.setdefault('PATH', '')
-            os.environ['PATH'] += os.pathsep + dll_directory
-
-    _AddDllDirectory(extra_dll_dir)
+if sys.platform == 'win32' and os.path.isdir(extra_dll_dir):
+    os.environ.setdefault('PATH', '')
+    os.environ['PATH'] += os.pathsep + extra_dll_dir
 
 """)
 
diff --git a/numpy/distutils/tests/test_fcompiler.py b/numpy/distutils/tests/test_fcompiler.py
new file mode 100644
index 000000000..95e44b051
--- /dev/null
+++ b/numpy/distutils/tests/test_fcompiler.py
@@ -0,0 +1,44 @@
+from __future__ import division, absolute_import, print_function
+
+from numpy.testing import assert_
+import numpy.distutils.fcompiler
+
+customizable_flags = [
+    ('f77', 'F77FLAGS'),
+    ('f90', 'F90FLAGS'),
+    ('free', 'FREEFLAGS'),
+    ('arch', 'FARCH'),
+    ('debug', 'FDEBUG'),
+    ('flags', 'FFLAGS'),
+    ('linker_so', 'LDFLAGS'),
+]
+
+
+def test_fcompiler_flags(monkeypatch):
+    monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '0')
+    fc = numpy.distutils.fcompiler.new_fcompiler(compiler='none')
+    flag_vars = fc.flag_vars.clone(lambda *args, **kwargs: None)
+
+    for opt, envvar in customizable_flags:
+        new_flag = '-dummy-{}-flag'.format(opt)
+        prev_flags = getattr(flag_vars, opt)
+
+        monkeypatch.setenv(envvar, new_flag)
+        new_flags = getattr(flag_vars, opt)
+        monkeypatch.delenv(envvar)
+        assert_(new_flags == [new_flag])
+
+    monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '1')
+
+    for opt, envvar in customizable_flags:
+        new_flag = '-dummy-{}-flag'.format(opt)
+        prev_flags = getattr(flag_vars, opt)
+
+        monkeypatch.setenv(envvar, new_flag)
+        new_flags = getattr(flag_vars, opt)
+        monkeypatch.delenv(envvar)
+        if prev_flags is None:
+            assert_(new_flags == [new_flag])
+        else:
+            assert_(new_flags == prev_flags + [new_flag])
+
diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py
index 5075c682d..fbb64f762 100644
--- a/numpy/f2py/__init__.py
+++ b/numpy/f2py/__init__.py
@@ -69,6 +69,6 @@ def compile(source,
         f.close()
     return status
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py
index bbb6ec8c7..44243b483 100644
--- a/numpy/fft/__init__.py
+++ b/numpy/fft/__init__.py
@@ -6,6 +6,6 @@ from .info import __doc__
 from .fftpack import *
 from .helper import *
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py
index d764cdc7e..dc40ac67b 100644
--- a/numpy/lib/__init__.py
+++ b/numpy/lib/__init__.py
@@ -46,6 +46,6 @@ __all__ += financial.__all__
 __all__ += nanfunctions.__all__
 __all__ += histograms.__all__
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index 4d3f35183..5880ea154 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -607,6 +607,14 @@ def isin(element, test_elements, assume_unique=False, invert=False):
            [ True,  False]])
     >>> element[mask]
     array([2, 4])
+
+    The indices of the matched values can be obtained with `nonzero`:
+
+    >>> np.nonzero(mask)
+    (array([0, 1]), array([1, 0]))
+
+    The test can also be inverted:
+
     >>> mask = np.isin(element, test_elements, invert=True)
     >>> mask
     array([[ True, False],
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 128da22c6..1a43da8b0 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -27,10 +27,11 @@ from numpy.core.fromnumeric import (
     ravel, nonzero, sort, partition, mean, any, sum
     )
 from numpy.core.numerictypes import typecodes, number
+from numpy.core.function_base import add_newdoc
 from numpy.lib.twodim_base import diag
 from .utils import deprecate
 from numpy.core.multiarray import (
-    _insert, add_docstring, digitize, bincount, normalize_axis_index,
+    _insert, add_docstring, bincount, normalize_axis_index, _monotonicity,
     interp as compiled_interp, interp_complex as compiled_interp_complex
     )
 from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
@@ -3398,9 +3399,9 @@ def _median(a, axis=None, out=None, overwrite_input=False):
 def percentile(a, q, axis=None, out=None,
                overwrite_input=False, interpolation='linear', keepdims=False):
     """
-    Compute the qth percentile of the data along the specified axis.
+    Compute the q-th percentile of the data along the specified axis.
 
-    Returns the qth percentile(s) of the array elements.
+    Returns the q-th percentile(s) of the array elements.
 
     Parameters
     ----------
@@ -3467,7 +3468,7 @@ def percentile(a, q, axis=None, out=None,
 
     Notes
     -----
-    Given a vector ``V`` of length ``N``, the ``q``-th percentile of
+    Given a vector ``V`` of length ``N``, the q-th percentile of
     ``V`` is the value ``q/100`` of the way from the minimum to the
     maximum in a sorted copy of ``V``. The values and distances of
     the two nearest neighbors as well as the `interpolation` parameter
@@ -3543,7 +3544,7 @@ def percentile(a, q, axis=None, out=None,
 def quantile(a, q, axis=None, out=None,
              overwrite_input=False, interpolation='linear', keepdims=False):
     """
-    Compute the `q`th quantile of the data along the specified axis.
+    Compute the q-th quantile of the data along the specified axis.
     ..versionadded:: 1.15.0
 
     Parameters
@@ -3569,6 +3570,7 @@ def quantile(a, q, axis=None, out=None,
         This optional parameter specifies the interpolation method to
         use when the desired quantile lies between two data points
         ``i < j``:
+
             * linear: ``i + (j - i) * fraction``, where ``fraction``
               is the fractional part of the index surrounded by ``i``
               and ``j``.
@@ -3602,7 +3604,7 @@ def quantile(a, q, axis=None, out=None,
 
     Notes
     -----
-    Given a vector ``V`` of length ``N``, the ``q``-th quantile of
+    Given a vector ``V`` of length ``N``, the q-th quantile of
     ``V`` is the value ``q`` of the way from the minimum to the
     maximum in a sorted copy of ``V``. The values and distances of
     the two nearest neighbors as well as the `interpolation` parameter
@@ -3720,7 +3722,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
             indices = concatenate((indices, [-1]))
 
         ap.partition(indices, axis=axis)
-        # ensure axis with qth is first
+        # ensure axis with q-th is first
         ap = np.moveaxis(ap, axis, 0)
         axis = 0
 
@@ -3753,7 +3755,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
 
         ap.partition(concatenate((indices_below, indices_above)), axis=axis)
 
-        # ensure axis with qth is first
+        # ensure axis with q-th is first
         ap = np.moveaxis(ap, axis, 0)
         weights_below = np.moveaxis(weights_below, axis, 0)
         weights_above = np.moveaxis(weights_above, axis, 0)
@@ -3767,7 +3769,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
         x1 = take(ap, indices_below, axis=axis) * weights_below
         x2 = take(ap, indices_above, axis=axis) * weights_above
 
-        # ensure axis with qth is first
+        # ensure axis with q-th is first
         x1 = np.moveaxis(x1, axis, 0)
         x2 = np.moveaxis(x2, axis, 0)
 
@@ -3891,41 +3893,6 @@ def trapz(y, x=None, dx=1.0, axis=-1):
     return ret
 
 
-#always succeed
-def add_newdoc(place, obj, doc):
-    """
-    Adds documentation to obj which is in module place.
-
-    If doc is a string add it to obj as a docstring
-
-    If doc is a tuple, then the first element is interpreted as
-       an attribute of obj and the second as the docstring
-          (method, docstring)
-
-    If doc is a list, then each element of the list should be a
-       sequence of length two --> [(method1, docstring1),
-       (method2, docstring2), ...]
-
-    This routine never raises an error.
-
-    This routine cannot modify read-only docstrings, as appear
-    in new-style classes or built-in functions. Because this
-    routine never raises an error the caller must check manually
-    that the docstrings were changed.
-    """
-    try:
-        new = getattr(__import__(place, globals(), {}, [obj]), obj)
-        if isinstance(doc, str):
-            add_docstring(new, doc.strip())
-        elif isinstance(doc, tuple):
-            add_docstring(getattr(new, doc[0]), doc[1].strip())
-        elif isinstance(doc, list):
-            for val in doc:
-                add_docstring(getattr(new, val[0]), val[1].strip())
-    except Exception:
-        pass
-
-
 # Based on scitools meshgrid
 def meshgrid(*xi, **kwargs):
     """
@@ -4526,3 +4493,113 @@ def append(arr, values, axis=None):
         values = ravel(values)
         axis = arr.ndim-1
     return concatenate((arr, values), axis=axis)
+
+
+def digitize(x, bins, right=False):
+    """
+    Return the indices of the bins to which each value in input array belongs.
+
+    =========  =============  ============================
+    `right`    order of bins  returned index `i` satisfies
+    =========  =============  ============================
+    ``False``  increasing     ``bins[i-1] <= x < bins[i]``
+    ``True``   increasing     ``bins[i-1] < x <= bins[i]``
+    ``False``  decreasing     ``bins[i-1] > x >= bins[i]``
+    ``True``   decreasing     ``bins[i-1] >= x > bins[i]``
+    =========  =============  ============================
+
+    If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
+    returned as appropriate.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array to be binned. Prior to NumPy 1.10.0, this array had to
+        be 1-dimensional, but can now have any shape.
+    bins : array_like
+        Array of bins. It has to be 1-dimensional and monotonic.
+    right : bool, optional
+        Indicating whether the intervals include the right or the left bin
+        edge. Default behavior is (right==False) indicating that the interval
+        does not include the right edge. The left bin end is open in this
+        case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
+        monotonically increasing bins.
+
+    Returns
+    -------
+    indices : ndarray of ints
+        Output array of indices, of same shape as `x`.
+
+    Raises
+    ------
+    ValueError
+        If `bins` is not monotonic.
+    TypeError
+        If the type of the input is complex.
+
+    See Also
+    --------
+    bincount, histogram, unique, searchsorted
+
+    Notes
+    -----
+    If values in `x` are such that they fall outside the bin range,
+    attempting to index `bins` with the indices that `digitize` returns
+    will result in an IndexError.
+
+    .. versionadded:: 1.10.0
+
+    `np.digitize` is  implemented in terms of `np.searchsorted`. This means
+    that a binary search is used to bin the values, which scales much better
+    for larger number of bins than the previous linear search. It also removes
+    the requirement for the input array to be 1-dimensional.
+
+    For monotonically _increasing_ `bins`, the following are equivalent::
+
+        np.digitize(x, bins, right=True)
+        np.searchsorted(bins, x, side='left')
+
+    Note that as the order of the arguments are reversed, the side must be too.
+    The `searchsorted` call is marginally faster, as it does not do any
+    monotonicity checks. Perhaps more importantly, it supports all dtypes.
+
+    Examples
+    --------
+    >>> x = np.array([0.2, 6.4, 3.0, 1.6])
+    >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
+    >>> inds = np.digitize(x, bins)
+    >>> inds
+    array([1, 4, 3, 2])
+    >>> for n in range(x.size):
+    ...   print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
+    ...
+    0.0 <= 0.2 < 1.0
+    4.0 <= 6.4 < 10.0
+    2.5 <= 3.0 < 4.0
+    1.0 <= 1.6 < 2.5
+
+    >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
+    >>> bins = np.array([0, 5, 10, 15, 20])
+    >>> np.digitize(x,bins,right=True)
+    array([1, 2, 3, 4, 4])
+    >>> np.digitize(x,bins,right=False)
+    array([1, 3, 3, 4, 5])
+    """
+    x = _nx.asarray(x)
+    bins = _nx.asarray(bins)
+
+    # here for compatibility, searchsorted below is happy to take this
+    if np.issubdtype(x.dtype, _nx.complexfloating):
+        raise TypeError("x may not be complex")
+
+    mono = _monotonicity(bins)
+    if mono == 0:
+        raise ValueError("bins must be monotonically increasing or decreasing")
+
+    # this is backwards because the arguments below are swapped
+    side = 'left' if right else 'right'
+    if mono == -1:
+        # reverse the bins, and invert the results
+        return len(bins) - _nx.searchsorted(bins[::-1], x, side=side)
+    else:
+        return _nx.searchsorted(bins, x, side=side)
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 337957dd5..422b356f7 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -782,7 +782,7 @@ def histogram(a, bins=10, range=None, normed=None, weights=None,
                     "The normed argument is ignored when density is provided. "
                     "In future passing both will result in an error.",
                     DeprecationWarning, stacklevel=2)
-        normed = False
+        normed = None
 
     if density:
         db = np.array(np.diff(bin_edges), float)
@@ -812,7 +812,8 @@ def histogram(a, bins=10, range=None, normed=None, weights=None,
         return n, bin_edges
 
 
-def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
+def histogramdd(sample, bins=10, range=None, normed=None, weights=None,
+                density=None):
     """
     Compute the multidimensional histogram of some data.
 
@@ -845,9 +846,14 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
         An entry of None in the sequence results in the minimum and maximum
         values being used for the corresponding dimension.
         The default, None, is equivalent to passing a tuple of D None values.
+    density : bool, optional
+        If False, the default, returns the number of samples in each bin.
+        If True, returns the probability *density* function at the bin,
+        ``bin_count / sample_count / bin_volume``.
     normed : bool, optional
-        If False, returns the number of samples in each bin. If True,
-        returns the bin density ``bin_count / sample_count / bin_volume``.
+        An alias for the density argument that behaves identically. To avoid
+        confusion with the broken normed argument to `histogram`, `density`
+        should be preferred.
     weights : (N,) array_like, optional
         An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
         Weights are normalized to 1 if normed is True. If normed is False,
@@ -961,8 +967,18 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
     core = D*(slice(1, -1),)
     hist = hist[core]
 
-    # Normalize if normed is True
-    if normed:
+    # handle the aliasing normed argument
+    if normed is None:
+        if density is None:
+            density = False
+    elif density is None:
+        # an explicit normed argument was passed, alias it to the new name
+        density = normed
+    else:
+        raise TypeError("Cannot specify both 'normed' and 'density'")
+
+    if density:
+        # calculate the probability density function
         s = hist.sum()
         for i in _range(D):
             shape = np.ones(D, int)
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index d2139338e..009e6d229 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -121,39 +121,13 @@ class nd_grid(object):
     Notes
     -----
     Two instances of `nd_grid` are made available in the NumPy namespace,
-    `mgrid` and `ogrid`::
+    `mgrid` and `ogrid`, approximately defined as::
 
         mgrid = nd_grid(sparse=False)
         ogrid = nd_grid(sparse=True)
 
     Users should use these pre-defined instances instead of using `nd_grid`
     directly.
-
-    Examples
-    --------
-    >>> mgrid = np.lib.index_tricks.nd_grid()
-    >>> mgrid[0:5,0:5]
-    array([[[0, 0, 0, 0, 0],
-            [1, 1, 1, 1, 1],
-            [2, 2, 2, 2, 2],
-            [3, 3, 3, 3, 3],
-            [4, 4, 4, 4, 4]],
-           [[0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4]]])
-    >>> mgrid[-1:1:5j]
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-
-    >>> ogrid = np.lib.index_tricks.nd_grid(sparse=True)
-    >>> ogrid[0:5,0:5]
-    [array([[0],
-            [1],
-            [2],
-            [3],
-            [4]]), array([[0, 1, 2, 3, 4]])]
-
     """
 
     def __init__(self, sparse=False):
@@ -223,10 +197,97 @@ class nd_grid(object):
     def __len__(self):
         return 0
 
-mgrid = nd_grid(sparse=False)
-ogrid = nd_grid(sparse=True)
-mgrid.__doc__ = None  # set in numpy.add_newdocs
-ogrid.__doc__ = None  # set in numpy.add_newdocs
+
+class MGridClass(nd_grid):
+    """
+    `nd_grid` instance which returns a dense multi-dimensional "meshgrid".
+
+    An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense
+    (or fleshed out) mesh-grid when indexed, so that each returned argument
+    has the same shape.  The dimensions and number of the output arrays are
+    equal to the number of indexing dimensions.  If the step length is not a
+    complex number, then the stop is not inclusive.
+
+    However, if the step length is a **complex number** (e.g. 5j), then
+    the integer part of its magnitude is interpreted as specifying the
+    number of points to create between the start and stop values, where
+    the stop value **is inclusive**.
+
+    Returns
+    ----------
+    mesh-grid `ndarrays` all of the same dimensions
+
+    See Also
+    --------
+    numpy.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
+    ogrid : like mgrid but returns open (not fleshed out) mesh grids
+    r_ : array concatenator
+
+    Examples
+    --------
+    >>> np.mgrid[0:5,0:5]
+    array([[[0, 0, 0, 0, 0],
+            [1, 1, 1, 1, 1],
+            [2, 2, 2, 2, 2],
+            [3, 3, 3, 3, 3],
+            [4, 4, 4, 4, 4]],
+           [[0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4]]])
+    >>> np.mgrid[-1:1:5j]
+    array([-1. , -0.5,  0. ,  0.5,  1. ])
+
+    """
+    def __init__(self):
+        super(MGridClass, self).__init__(sparse=False)
+
+mgrid = MGridClass()
+
+class OGridClass(nd_grid):
+    """
+    `nd_grid` instance which returns an open multi-dimensional "meshgrid".
+
+    An instance of `numpy.lib.index_tricks.nd_grid` which returns an open
+    (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension
+    of each returned array is greater than 1.  The dimension and number of the
+    output arrays are equal to the number of indexing dimensions.  If the step
+    length is not a complex number, then the stop is not inclusive.
+
+    However, if the step length is a **complex number** (e.g. 5j), then
+    the integer part of its magnitude is interpreted as specifying the
+    number of points to create between the start and stop values, where
+    the stop value **is inclusive**.
+
+    Returns
+    ----------
+    mesh-grid `ndarrays` with only one dimension :math:`\\neq 1`
+
+    See Also
+    --------
+    np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
+    mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids
+    r_ : array concatenator
+
+    Examples
+    --------
+    >>> from numpy import ogrid
+    >>> ogrid[-1:1:5j]
+    array([-1. , -0.5,  0. ,  0.5,  1. ])
+    >>> ogrid[0:5,0:5]
+    [array([[0],
+            [1],
+            [2],
+            [3],
+            [4]]), array([[0, 1, 2, 3, 4]])]
+
+    """
+    def __init__(self):
+        super(OGridClass, self).__init__(sparse=True)
+
+ogrid = OGridClass()
+
 
 class AxisConcatenator(object):
     """
diff --git a/numpy/lib/stride_tricks.py b/numpy/lib/stride_tricks.py
index 2abe5cdd1..bc5993802 100644
--- a/numpy/lib/stride_tricks.py
+++ b/numpy/lib/stride_tricks.py
@@ -219,23 +219,19 @@ def broadcast_arrays(*args, **kwargs):
     Examples
     --------
     >>> x = np.array([[1,2,3]])
-    >>> y = np.array([[1],[2],[3]])
+    >>> y = np.array([[4],[5]])
     >>> np.broadcast_arrays(x, y)
     [array([[1, 2, 3],
-           [1, 2, 3],
-           [1, 2, 3]]), array([[1, 1, 1],
-           [2, 2, 2],
-           [3, 3, 3]])]
+           [1, 2, 3]]), array([[4, 4, 4],
+           [5, 5, 5]])]
 
     Here is a useful idiom for getting contiguous copies instead of
     non-contiguous views.
 
     >>> [np.array(a) for a in np.broadcast_arrays(x, y)]
     [array([[1, 2, 3],
-           [1, 2, 3],
-           [1, 2, 3]]), array([[1, 1, 1],
-           [2, 2, 2],
-           [3, 3, 3]])]
+           [1, 2, 3]]), array([[4, 4, 4],
+           [5, 5, 5]])]
 
     """
     # nditer is not used here to avoid the limit of 32 arrays.
diff --git a/numpy/lib/tests/test_arraypad.py b/numpy/lib/tests/test_arraypad.py
index 8ba0370b0..45d624781 100644
--- a/numpy/lib/tests/test_arraypad.py
+++ b/numpy/lib/tests/test_arraypad.py
@@ -1009,6 +1009,21 @@ class TestUnicodeInput(object):
         assert_array_equal(a, b)
 
 
+class TestObjectInput(object):
+    def test_object_input(self):
+        # Regression test for issue gh-11395.
+        a = np.full((4, 3), None)
+        pad_amt = ((2, 3), (3, 2))
+        b = np.full((9, 8), None)
+        modes = ['edge',
+                 'symmetric',
+                 'reflect',
+                 'wrap',
+                 ]
+        for mode in modes:
+            assert_array_equal(pad(a, pad_amt, mode=mode), b)
+
+
 class TestValueError1(object):
     def test_check_simple(self):
         arr = np.arange(30)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 4103a9eb3..ba5b90e8c 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1510,6 +1510,18 @@ class TestDigitize(object):
         assert_(not isinstance(digitize(b, a, False), A))
         assert_(not isinstance(digitize(b, a, True), A))
 
+    def test_large_integers_increasing(self):
+        # gh-11022
+        x = 2**54  # loses precision in a float
+        assert_equal(np.digitize(x, [x - 1, x + 1]), 1)
+
+    @pytest.mark.xfail(
+        reason="gh-11022: np.core.multiarray._monoticity loses precision")
+    def test_large_integers_decreasing(self):
+        # gh-11022
+        x = 2**54  # loses precision in a float
+        assert_equal(np.digitize(x, [x + 1, x - 1]), 1)
+
 
 class TestUnwrap(object):
 
@@ -2237,6 +2249,14 @@ class TestInterp(object):
         x0 = np.nan
         assert_almost_equal(np.interp(x0, x, y), x0)
 
+    def test_non_finite_behavior(self):
+        x = [1, 2, 2.5, 3, 4]
+        xp = [1, 2, 3, 4]
+        fp = [1, 2, np.inf, 4]
+        assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.inf, np.inf, 4])
+        fp = [1, 2, np.nan, 4]
+        assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.nan, np.nan, 4])
+
     def test_complex_interp(self):
         # test complex interpolation
         x = np.linspace(0, 1, 5)
@@ -2251,6 +2271,12 @@ class TestInterp(object):
         x0 = 2.0
         right = 2 + 3.0j
         assert_almost_equal(np.interp(x0, x, y, right=right), right)
+        # test complex non finite
+        x = [1, 2, 2.5, 3, 4]
+        xp = [1, 2, 3, 4]
+        fp = [1, 2+1j, np.inf, 4]
+        y = [1, 2+1j, np.inf+0.5j, np.inf, 4]
+        assert_almost_equal(np.interp(x, xp, fp), y)
         # test complex periodic
         x = [-180, -170, -185, 185, -10, -5, 0, 365]
         xp = [190, -190, 350, -350]
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 9bea2aca8..f136b5c81 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -78,6 +78,10 @@ class TestHistogram(object):
         assert_array_equal(a, .1)
         assert_equal(np.sum(a * np.diff(b)), 1)
 
+        # Test that passing False works too
+        a, b = histogram(v, bins, density=False)
+        assert_array_equal(a, [1, 2, 3, 4])
+
         # Variale bin widths are especially useful to deal with
         # infinities.
         v = np.arange(10)
@@ -543,13 +547,13 @@ class TestHistogramdd(object):
 
         # Check normalization
         ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
-        H, edges = histogramdd(x, bins=ed, normed=True)
+        H, edges = histogramdd(x, bins=ed, density=True)
         assert_(np.all(H == answer / 12.))
 
         # Check that H has the correct shape.
         H, edges = histogramdd(x, (2, 3, 4),
                                range=[[-1, 1], [0, 3], [0, 4]],
-                               normed=True)
+                               density=True)
         answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
                            [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
         assert_array_almost_equal(H, answer / 6., 4)
@@ -595,10 +599,10 @@ class TestHistogramdd(object):
     def test_weights(self):
         v = np.random.rand(100, 2)
         hist, edges = histogramdd(v)
-        n_hist, edges = histogramdd(v, normed=True)
+        n_hist, edges = histogramdd(v, density=True)
         w_hist, edges = histogramdd(v, weights=np.ones(100))
         assert_array_equal(w_hist, hist)
-        w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
+        w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, density=True)
         assert_array_equal(w_hist, n_hist)
         w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
         assert_array_equal(w_hist, 2 * hist)
@@ -704,7 +708,7 @@ class TestHistogramdd(object):
 
         assert_equal(hist[0, 0], 1)
 
-    def test_normed_non_uniform_2d(self):
+    def test_density_non_uniform_2d(self):
         # Defines the following grid:
         #
         #    0 2     8
@@ -728,14 +732,14 @@ class TestHistogramdd(object):
         assert_equal(hist, relative_areas)
 
         # resulting histogram should be uniform, since counts and areas are propotional
-        hist, edges = histogramdd((y, x), bins=(y_edges, x_edges), normed=True)
+        hist, edges = histogramdd((y, x), bins=(y_edges, x_edges), density=True)
         assert_equal(hist, 1 / (8*8))
 
-    def test_normed_non_uniform_1d(self):
+    def test_density_non_uniform_1d(self):
         # compare to histogram to show the results are the same
         v = np.arange(10)
         bins = np.array([0, 1, 3, 6, 10])
         hist, edges = histogram(v, bins, density=True)
-        hist_dd, edges_dd = histogramdd((v,), (bins,), normed=True)
+        hist_dd, edges_dd = histogramdd((v,), (bins,), density=True)
         assert_equal(hist, hist_dd)
         assert_equal(edges, edges_dd[0])
diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py
index d3a072af3..bf93b4adb 100644
--- a/numpy/lib/tests/test_twodim_base.py
+++ b/numpy/lib/tests/test_twodim_base.py
@@ -208,7 +208,7 @@ class TestHistogram2d(object):
         x = array([1, 1, 2, 3, 4, 4, 4, 5])
         y = array([1, 3, 2, 0, 1, 2, 3, 4])
         H, xed, yed = histogram2d(
-            x, y, (6, 5), range=[[0, 6], [0, 5]], normed=True)
+            x, y, (6, 5), range=[[0, 6], [0, 5]], density=True)
         answer = array(
             [[0., 0, 0, 0, 0],
              [0, 1, 0, 1, 0],
@@ -220,11 +220,11 @@ class TestHistogram2d(object):
         assert_array_equal(xed, np.linspace(0, 6, 7))
         assert_array_equal(yed, np.linspace(0, 5, 6))
 
-    def test_norm(self):
+    def test_density(self):
         x = array([1, 2, 3, 1, 2, 3, 1, 2, 3])
         y = array([1, 1, 1, 2, 2, 2, 3, 3, 3])
         H, xed, yed = histogram2d(
-            x, y, [[1, 2, 3, 5], [1, 2, 3, 5]], normed=True)
+            x, y, [[1, 2, 3, 5], [1, 2, 3, 5]], density=True)
         answer = array([[1, 1, .5],
                         [1, 1, .5],
                         [.5, .5, .25]])/9.
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index cca316e9a..98efba191 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -530,7 +530,8 @@ def vander(x, N=None, increasing=False):
     return v
 
 
-def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
+def histogram2d(x, y, bins=10, range=None, normed=None, weights=None,
+                density=None):
     """
     Compute the bi-dimensional histogram of two data samples.
 
@@ -560,9 +561,14 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
         (if not specified explicitly in the `bins` parameters):
         ``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range
         will be considered outliers and not tallied in the histogram.
+    density : bool, optional
+        If False, the default, returns the number of samples in each bin.
+        If True, returns the probability *density* function at the bin,
+        ``bin_count / sample_count / bin_area``.
     normed : bool, optional
-        If False, returns the number of samples in each bin. If True,
-        returns the bin density ``bin_count / sample_count / bin_area``.
+        An alias for the density argument that behaves identically. To avoid
+        confusion with the broken normed argument to `histogram`, `density`
+        should be preferred.
     weights : array_like, shape(N,), optional
         An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.
         Weights are normalized to 1 if `normed` is True. If `normed` is
@@ -652,7 +658,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
     if N != 1 and N != 2:
         xedges = yedges = asarray(bins)
         bins = [xedges, yedges]
-    hist, edges = histogramdd([x, y], bins, range, normed, weights)
+    hist, edges = histogramdd([x, y], bins, range, normed, weights, density)
     return hist, edges[0], edges[1]
 
 
diff --git a/numpy/linalg/__init__.py b/numpy/linalg/__init__.py
index 37bd27574..4b696c883 100644
--- a/numpy/linalg/__init__.py
+++ b/numpy/linalg/__init__.py
@@ -50,6 +50,6 @@ from .info import __doc__
 
 from .linalg import *
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 90dc2e657..8e7ad70cd 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1527,7 +1527,6 @@ def svd(a, full_matrices=True, compute_uv=True):
 
     """
     a, wrap = _makearray(a)
-    _assertNoEmpty2d(a)
     _assertRankAtLeast2(a)
     t, result_t = _commonType(a)
 
@@ -1644,6 +1643,7 @@ def cond(x, p=None):
 
     """
     x = asarray(x)  # in case we have a matrix
+    _assertNoEmpty2d(x)
     if p is None or p == 2 or p == -2:
         s = svd(x, compute_uv=False)
         with errstate(all='ignore'):
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 87dfe988a..1c24f1e04 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -644,10 +644,6 @@ class TestEig(EigCases):
 class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
 
     def do(self, a, b, tags):
-        if 'size-0' in tags:
-            assert_raises(LinAlgError, linalg.svd, a, 0)
-            return
-
         u, s, vt = linalg.svd(a, 0)
         assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
                                            np.asarray(vt)),
@@ -670,15 +666,19 @@ class TestSVD(SVDCases):
         for dtype in [single, double, csingle, cdouble]:
             check(dtype)
 
-    def test_0_size(self):
-        # These raise errors currently
-        # (which does not mean that it may not make sense)
-        a = np.zeros((0, 0), dtype=np.complex64)
-        assert_raises(linalg.LinAlgError, linalg.svd, a)
-        a = np.zeros((0, 1), dtype=np.complex64)
-        assert_raises(linalg.LinAlgError, linalg.svd, a)
-        a = np.zeros((1, 0), dtype=np.complex64)
-        assert_raises(linalg.LinAlgError, linalg.svd, a)
+    def test_empty_identity(self):
+        """ Empty input should put an identity matrix in u or vh """
+        x = np.empty((4, 0))
+        u, s, vh = linalg.svd(x, compute_uv=True)
+        assert_equal(u.shape, (4, 4))
+        assert_equal(vh.shape, (0, 0))
+        assert_equal(u, np.eye(4))
+
+        x = np.empty((0, 4))
+        u, s, vh = linalg.svd(x, compute_uv=True)
+        assert_equal(u.shape, (0, 0))
+        assert_equal(vh.shape, (4, 4))
+        assert_equal(vh, np.eye(4))
 
 
 class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 7dc1cb0cb..9fc68a7aa 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -2735,19 +2735,18 @@ static NPY_INLINE void
                            (fortran_int)dimensions[0],
                            (fortran_int)dimensions[1])) {
         LINEARIZE_DATA_t a_in, u_out, s_out, v_out;
+        fortran_int min_m_n = params.M < params.N ? params.M : params.N;
 
         init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]);
         if ('N' == params.JOBZ) {
             /* only the singular values are wanted */
-            fortran_int min_m_n = params.M < params.N? params.M : params.N;
             init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]);
         } else {
             fortran_int u_columns, v_rows;
-            fortran_int min_m_n = params.M < params.N? params.M : params.N;
             if ('S' == params.JOBZ) {
                 u_columns = min_m_n;
                 v_rows = min_m_n;
-            } else {
+            } else { /* JOBZ == 'A' */
                 u_columns = params.M;
                 v_rows = params.N;
             }
@@ -2771,6 +2770,15 @@ static NPY_INLINE void
                 if ('N' == params.JOBZ) {
                     delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out);
                 } else {
+                    if ('A' == params.JOBZ && min_m_n == 0) {
+                        /* Lapack has betrayed us and left these uninitialized,
+                         * so produce an identity matrix for whichever of u
+                         * and v is not empty.
+                         */
+                        identity_@TYPE@_matrix(params.U, params.M);
+                        identity_@TYPE@_matrix(params.VT, params.N);
+                    }
+
                     delinearize_@TYPE@_matrix(args[1], params.U, &u_out);
                     delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out);
                     delinearize_@TYPE@_matrix(args[3], params.VT, &v_out);
diff --git a/numpy/ma/__init__.py b/numpy/ma/__init__.py
index 34f21b8b1..36ceb1f6e 100644
--- a/numpy/ma/__init__.py
+++ b/numpy/ma/__init__.py
@@ -51,6 +51,6 @@ __all__ = ['core', 'extras']
 __all__ += core.__all__
 __all__ += extras.__all__
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 091ab4e20..5bfa51b12 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -7115,32 +7115,32 @@ size.__doc__ = np.size.__doc__
 
 def where(condition, x=_NoValue, y=_NoValue):
     """
-    Return a masked array with elements from x or y, depending on condition.
+    Return a masked array with elements from `x` or `y`, depending on condition.
 
-    Returns a masked array, shaped like condition, where the elements
-    are from `x` when `condition` is True, and from `y` otherwise.
-    If neither `x` nor `y` are given, the function returns a tuple of
-    indices where `condition` is True (the result of
-    ``condition.nonzero()``).
+    .. note::
+        When only `condition` is provided, this function is identical to
+        `nonzero`. The rest of this documentation covers only the case where
+        all three arguments are provided.
 
     Parameters
     ----------
     condition : array_like, bool
-        The condition to meet. For each True element, yield the corresponding
-        element from `x`, otherwise from `y`.
+        Where True, yield `x`, otherwise yield `y`. 
     x, y : array_like, optional
         Values from which to choose. `x`, `y` and `condition` need to be
         broadcastable to some shape.
 
     Returns
     -------
-    out : MaskedArray or tuple of ndarrays
-        The resulting masked array if `x` and `y` were given, otherwise
-        the result of ``condition.nonzero()``.
+    out : MaskedArray
+        An masked array with `masked` elements where the condition is masked,
+        elements from `x` where `condition` is True, and elements from `y`
+        elsewhere.
 
     See Also
     --------
     numpy.where : Equivalent function in the top-level NumPy module.
+    nonzero : The function that is called when x and y are omitted
 
     Examples
     --------
@@ -7151,9 +7151,6 @@ def where(condition, x=_NoValue, y=_NoValue):
     [[0.0 -- 2.0]
      [-- 4.0 --]
      [6.0 -- 8.0]]
-    >>> np.ma.where(x > 5)    # return the indices where x > 5
-    (array([2, 2]), array([0, 2]))
-
     >>> print(np.ma.where(x > 5, x, -3.1416))
     [[-3.1416 -- -3.1416]
      [-- -3.1416 --]
diff --git a/numpy/matrixlib/__init__.py b/numpy/matrixlib/__init__.py
index 3ad3a9549..777e0cd33 100644
--- a/numpy/matrixlib/__init__.py
+++ b/numpy/matrixlib/__init__.py
@@ -7,6 +7,6 @@ from .defmatrix import *
 
 __all__ = defmatrix.__all__
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py
index c18bebedb..85cee9ce6 100644
--- a/numpy/polynomial/__init__.py
+++ b/numpy/polynomial/__init__.py
@@ -22,6 +22,6 @@ from .hermite import Hermite
 from .hermite_e import HermiteE
 from .laguerre import Laguerre
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py
index 946e0499c..310c711ef 100644
--- a/numpy/polynomial/chebyshev.py
+++ b/numpy/polynomial/chebyshev.py
@@ -365,7 +365,7 @@ def poly2cheb(pol):
     >>> c = p.convert(kind=P.Chebyshev)
     >>> c
     Chebyshev([ 1.  ,  3.25,  1.  ,  0.75], domain=[-1,  1], window=[-1,  1])
-    >>> P.poly2cheb(range(4))
+    >>> P.chebyshev.poly2cheb(range(4))
     array([ 1.  ,  3.25,  1.  ,  0.75])
 
     """
@@ -417,7 +417,7 @@ def cheb2poly(c):
     >>> p = c.convert(kind=P.Polynomial)
     >>> p
     Polynomial([ -2.,  -8.,   4.,  12.], [-1.,  1.])
-    >>> P.cheb2poly(range(4))
+    >>> P.chebyshev.cheb2poly(range(4))
     array([ -2.,  -8.,   4.,  12.])
 
     """
diff --git a/numpy/random/__init__.py b/numpy/random/__init__.py
index 81cb94cc1..82aefce5f 100644
--- a/numpy/random/__init__.py
+++ b/numpy/random/__init__.py
@@ -117,6 +117,6 @@ def __RandomState_ctor():
     """
     return RandomState(seed=0)
 
-from numpy.testing._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
diff --git a/numpy/random/mtrand/randomkit.c b/numpy/random/mtrand/randomkit.c
index 380917180..6371ebe33 100644
--- a/numpy/random/mtrand/randomkit.c
+++ b/numpy/random/mtrand/randomkit.c
@@ -616,7 +616,7 @@ rk_gauss(rk_state *state)
         }
         while (r2 >= 1.0 || r2 == 0.0);
 
-        /* Box-Muller transform */
+        /* Polar method, a more efficient version of the Box-Muller approach. */
         f = sqrt(-2.0*log(r2)/r2);
         /* Keep for next call */
         state->gauss = f*x1;
diff --git a/numpy/testing/__init__.py b/numpy/testing/__init__.py
index a7c85931c..a8bd4fc15 100644
--- a/numpy/testing/__init__.py
+++ b/numpy/testing/__init__.py
@@ -17,6 +17,6 @@ from ._private.nosetester import (
 
 __all__ = _private.utils.__all__ + ['TestCase', 'run_module_suite']
 
-from ._private.pytesttester import PytestTester
+from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester