Diffstat (limited to 'numpy'): 130 files changed, 3704 insertions, 2582 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py index d250ed5ac..2b8d41798 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -11,10 +11,10 @@ How to use the documentation ---------------------------- Documentation is available in two forms: docstrings provided with the code, and a loose standing reference guide, available from -`the NumPy homepage <http://www.scipy.org>`_. +`the NumPy homepage <https://www.scipy.org>`_. We recommend exploring the docstrings using -`IPython <http://ipython.scipy.org>`_, an advanced Python shell with +`IPython <https://ipython.org>`_, an advanced Python shell with TAB-completion and introspection capabilities. See below for further instructions. @@ -139,9 +139,7 @@ else: loader = PackageLoader(infunc=True) return loader(*packages, **options) - from . import add_newdocs - __all__ = ['add_newdocs', - 'ModuleDeprecationWarning', + __all__ = ['ModuleDeprecationWarning', 'VisibleDeprecationWarning'] pkgload.__doc__ = PackageLoader.__call__.__doc__ @@ -181,6 +179,11 @@ else: __all__.extend(lib.__all__) __all__.extend(['linalg', 'fft', 'random', 'ctypeslib', 'ma']) + # Filter out Cython harmless warnings + warnings.filterwarnings("ignore", message="numpy.dtype size changed") + warnings.filterwarnings("ignore", message="numpy.ufunc size changed") + warnings.filterwarnings("ignore", message="numpy.ndarray size changed") + # oldnumeric and numarray were removed in 1.9. In case some packages import # but do not use them, we define them here for backward compatibility. oldnumeric = 'removed' @@ -191,7 +194,7 @@ else: from .testing import Tester # Pytest testing - from numpy.testing._private.pytesttester import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester @@ -214,7 +217,9 @@ else: except AssertionError: msg = ("The current Numpy installation ({!r}) fails to " "pass simple sanity checks. This can be caused for example " - "by incorrect BLAS library being linked in.") + "by incorrect BLAS library being linked in, or by mixing " + "package managers (pip, conda, apt, ...). Search closed " + "numpy issues for similar problems.") raise RuntimeError(msg.format(__file__)) _sanity_check() diff --git a/numpy/testing/_private/pytesttester.py b/numpy/_pytesttester.py index 8c73fafa4..6a1b3274e 100644 --- a/numpy/testing/_private/pytesttester.py +++ b/numpy/_pytesttester.py @@ -5,7 +5,7 @@ This module implements the ``test()`` function for NumPy modules. The usual boiler plate for doing that is to put the following in the module ``__init__.py`` file:: - from numpy.testing import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__).test del PytestTester @@ -23,6 +23,9 @@ whether or not that file is found as follows: In practice, tests run from the numpy repo are run in develop mode. That includes the standard ``python runtests.py`` invocation. +This module is imported by every numpy subpackage, so lies at the top level to +simplify circular import issues. For the same reason, it contains no numpy +imports at module scope, instead importing numpy within function calls. 
""" from __future__ import division, absolute_import, print_function diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py index 4d9cbf5da..9ef30b018 100644 --- a/numpy/core/__init__.py +++ b/numpy/core/__init__.py @@ -59,6 +59,10 @@ del nt from .fromnumeric import amax as max, amin as min, round_ as round from .numeric import absolute as abs +# do this after everything else, to minimize the chance of this misleadingly +# appearing in an import-time traceback +from . import _add_newdocs + __all__ = ['char', 'rec', 'memmap'] __all__ += numeric.__all__ __all__ += fromnumeric.__all__ @@ -100,6 +104,6 @@ del copyreg del sys del _ufunc_reduce -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/add_newdocs.py b/numpy/core/_add_newdocs.py index 9372b3431..b65920fde 100644 --- a/numpy/add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -10,7 +10,7 @@ NOTE: Many of the methods of ndarray have corresponding functions. """ from __future__ import division, absolute_import, print_function -from numpy.lib import add_newdoc +from numpy.core.function_base import add_newdoc ############################################################################### # @@ -605,6 +605,7 @@ add_newdoc('numpy.core', 'broadcast', Examples -------- + Manually adding two vectors, using broadcasting: >>> x = np.array([[1], [2], [3]]) @@ -1318,6 +1319,7 @@ add_newdoc('numpy.core.multiarray', 'concatenate', hstack : Stack arrays in sequence horizontally (column wise) vstack : Stack arrays in sequence vertically (row wise) dstack : Stack arrays in sequence depth wise (along third dimension) + block : Assemble arrays from blocks. Notes ----- @@ -1347,19 +1349,19 @@ add_newdoc('numpy.core.multiarray', 'concatenate', >>> a[1] = np.ma.masked >>> b = np.arange(2, 5) >>> a - masked_array(data = [0 -- 2], - mask = [False True False], - fill_value = 999999) + masked_array(data=[0, --, 2], + mask=[False, True, False], + fill_value=999999) >>> b array([2, 3, 4]) >>> np.concatenate([a, b]) - masked_array(data = [0 1 2 2 3 4], - mask = False, - fill_value = 999999) + masked_array(data=[0, 1, 2, 2, 3, 4], + mask=False, + fill_value=999999) >>> np.ma.concatenate([a, b]) - masked_array(data = [0 -- 2 2 3 4], - mask = [False True False False False False], - fill_value = 999999) + masked_array(data=[0, --, 2, 2, 3, 4], + mask=[False, True, False, False, False, False], + fill_value=999999) """) @@ -1452,7 +1454,7 @@ add_newdoc('numpy.core.multiarray', 'arange', Values are generated within the half-open interval ``[start, stop)`` (in other words, the interval including `start` but excluding `stop`). For integer arguments the function is equivalent to the Python built-in - `range <http://docs.python.org/lib/built-in-funcs.html>`_ function, + `range <https://docs.python.org/library/functions.html#func-range>`_ function, but returns an ndarray rather than a list. When using a non-integer step, such as 0.1, the results will often not @@ -1576,71 +1578,72 @@ add_newdoc('numpy.core.multiarray', 'where', """ where(condition, [x, y]) - Return elements, either from `x` or `y`, depending on `condition`. + Return elements chosen from `x` or `y` depending on `condition`. - If only `condition` is given, return ``condition.nonzero()``. + .. note:: + When only `condition` is provided, this function is a shorthand for + ``np.asarray(condition).nonzero()``. 
Using `nonzero` directly should be + preferred, as it behaves correctly for subclasses. The rest of this + documentation covers only the case where all three arguments are + provided. Parameters ---------- condition : array_like, bool - When True, yield `x`, otherwise yield `y`. - x, y : array_like, optional + Where True, yield `x`, otherwise yield `y`. + x, y : array_like Values from which to choose. `x`, `y` and `condition` need to be broadcastable to some shape. Returns ------- - out : ndarray or tuple of ndarrays - If both `x` and `y` are specified, the output array contains - elements of `x` where `condition` is True, and elements from - `y` elsewhere. - - If only `condition` is given, return the tuple - ``condition.nonzero()``, the indices where `condition` is True. + out : ndarray + An array with elements from `x` where `condition` is True, and elements + from `y` elsewhere. See Also -------- - nonzero, choose + choose + nonzero : The function that is called when x and y are omitted Notes ----- - If `x` and `y` are given and input arrays are 1-D, `where` is - equivalent to:: + If all the arrays are 1-D, `where` is equivalent to:: - [xv if c else yv for (c,xv,yv) in zip(condition,x,y)] + [xv if c else yv + for c, xv, yv in zip(condition, x, y)] Examples -------- + >>> a = np.arange(10) + >>> a + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> np.where(a < 5, a, 10*a) + array([ 0, 1, 2, 3, 4, 50, 60, 70, 80, 90]) + + This can be used on multidimensional arrays too: + >>> np.where([[True, False], [True, True]], ... [[1, 2], [3, 4]], ... [[9, 8], [7, 6]]) array([[1, 8], [3, 4]]) - >>> np.where([[0, 1], [1, 0]]) - (array([0, 1]), array([1, 0])) - - >>> x = np.arange(9.).reshape(3, 3) - >>> np.where( x > 5 ) - (array([2, 2, 2]), array([0, 1, 2])) - >>> x[np.where( x > 3.0 )] # Note: result is 1D. - array([ 4., 5., 6., 7., 8.]) - >>> np.where(x < 5, x, -1) # Note: broadcasting. - array([[ 0., 1., 2.], - [ 3., 4., -1.], - [-1., -1., -1.]]) - - Find the indices of elements of `x` that are in `goodvalues`. - - >>> goodvalues = [3, 4, 7] - >>> ix = np.isin(x, goodvalues) - >>> ix - array([[False, False, False], - [ True, True, False], - [False, True, False]]) - >>> np.where(ix) - (array([1, 1, 2]), array([0, 1, 1])) + The shapes of x, y, and the condition are broadcast together: + + >>> x, y = np.ogrid[:3, :4] + >>> np.where(x < y, x, 10 + y) # both x and 10+y are broadcast + array([[10, 0, 0, 0], + [10, 11, 1, 1], + [10, 11, 12, 2]]) + >>> a = np.array([[0, 1, 2], + ... [0, 2, 4], + ... [0, 3, 6]]) + >>> np.where(a < 4, a, -1) # -1 is broadcast + array([[ 0, 1, 2], + [ 0, 2, -1], + [ 0, 3, -1]]) """) @@ -2265,25 +2268,89 @@ add_newdoc('numpy.core', 'matmul', """) +add_newdoc('numpy.core', 'vdot', + """ + vdot(a, b) + + Return the dot product of two vectors. + + The vdot(`a`, `b`) function handles complex numbers differently than + dot(`a`, `b`). If the first argument is complex the complex conjugate + of the first argument is used for the calculation of the dot product. + + Note that `vdot` handles multidimensional arrays differently than `dot`: + it does *not* perform a matrix product, but flattens input arguments + to 1-D vectors first. Consequently, it should only be used for vectors. + + Parameters + ---------- + a : array_like + If `a` is complex the complex conjugate is taken before calculation + of the dot product. + b : array_like + Second argument to the dot product. + + Returns + ------- + output : ndarray + Dot product of `a` and `b`. 
Can be an int, float, or + complex depending on the types of `a` and `b`. + + See Also + -------- + dot : Return the dot product without using the complex conjugate of the + first argument. + + Examples + -------- + >>> a = np.array([1+2j,3+4j]) + >>> b = np.array([5+6j,7+8j]) + >>> np.vdot(a, b) + (70-8j) + >>> np.vdot(b, a) + (70+8j) + + Note that higher-dimensional arrays are flattened! + + >>> a = np.array([[1, 4], [5, 6]]) + >>> b = np.array([[4, 1], [2, 2]]) + >>> np.vdot(a, b) + 30 + >>> np.vdot(b, a) + 30 + >>> 1*4 + 4*1 + 5*2 + 6*2 + 30 + + """) -add_newdoc('numpy.core', 'c_einsum', +add_newdoc('numpy.core.multiarray', 'c_einsum', """ - c_einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe') + c_einsum(subscripts, *operands, out=None, dtype=None, order='K', + casting='safe') + + *This documentation shadows that of the native python implementation of the `einsum` function, + except all references and examples related to the `optimize` argument (v 0.12.0) have been removed.* Evaluates the Einstein summation convention on the operands. - Using the Einstein summation convention, many common multi-dimensional - array operations can be represented in a simple fashion. This function - provides a way to compute such summations. The best way to understand this - function is to try the examples below, which show how many common NumPy - functions can be implemented as calls to `einsum`. + Using the Einstein summation convention, many common multi-dimensional, + linear algebraic array operations can be represented in a simple fashion. + In *implicit* mode `einsum` computes these values. - This is the core C function. + In *explicit* mode, `einsum` provides further flexibility to compute + other array operations that might not be considered classical Einstein + summation operations, by disabling, or forcing summation over specified + subscript labels. + + See the notes and examples for clarification. Parameters ---------- subscripts : str - Specifies the subscripts for summation. + Specifies the subscripts for summation as comma separated list of + subscript labels. An implicit (classical Einstein summation) + calculation is performed unless the explicit indicator '->' is + included as well as subscript labels of the precise output form. operands : list of array_like These are the arrays for the operation. out : ndarray, optional @@ -2311,6 +2378,11 @@ add_newdoc('numpy.core', 'c_einsum', * 'unsafe' means any data conversions may be done. Default is 'safe'. + optimize : {False, True, 'greedy', 'optimal'}, optional + Controls if intermediate optimization should occur. No optimization + will occur if False and True will default to the 'greedy' algorithm. + Also accepts an explicit contraction list from the ``np.einsum_path`` + function. See ``np.einsum_path`` for more details. Defaults to False. Returns ------- @@ -2319,56 +2391,86 @@ add_newdoc('numpy.core', 'c_einsum', See Also -------- - einsum, dot, inner, outer, tensordot + einsum_path, dot, inner, outer, tensordot, linalg.multi_dot Notes ----- .. versionadded:: 1.6.0 - The subscripts string is a comma-separated list of subscript labels, - where each label refers to a dimension of the corresponding operand. - Repeated subscripts labels in one operand take the diagonal. For example, - ``np.einsum('ii', a)`` is equivalent to ``np.trace(a)``. + The Einstein summation convention can be used to compute + many multi-dimensional, linear algebraic array operations. `einsum` + provides a succinct way of representing these. 
- Whenever a label is repeated, it is summed, so ``np.einsum('i,i', a, b)`` - is equivalent to ``np.inner(a,b)``. If a label appears only once, - it is not summed, so ``np.einsum('i', a)`` produces a view of ``a`` - with no changes. + A non-exhaustive list of these operations, + which can be computed by `einsum`, is shown below along with examples: - The order of labels in the output is by default alphabetical. This - means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while - ``np.einsum('ji', a)`` takes its transpose. + * Trace of an array, :py:func:`numpy.trace`. + * Return a diagonal, :py:func:`numpy.diag`. + * Array axis summations, :py:func:`numpy.sum`. + * Transpositions and permutations, :py:func:`numpy.transpose`. + * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`. + * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`. + * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`. + * Tensor contractions, :py:func:`numpy.tensordot`. + * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`. - The output can be controlled by specifying output subscript labels - as well. This specifies the label order, and allows summing to - be disallowed or forced when desired. The call ``np.einsum('i->', a)`` - is like ``np.sum(a, axis=-1)``, and ``np.einsum('ii->i', a)`` - is like ``np.diag(a)``. The difference is that `einsum` does not - allow broadcasting by default. + The subscripts string is a comma-separated list of subscript labels, + where each label refers to a dimension of the corresponding operand. + Whenever a label is repeated it is summed, so ``np.einsum('i,i', a, b)`` + is equivalent to :py:func:`np.inner(a,b) <numpy.inner>`. If a label + appears only once, it is not summed, so ``np.einsum('i', a)`` produces a + view of ``a`` with no changes. A further example ``np.einsum('ij,jk', a, b)`` + describes traditional matrix multiplication and is equivalent to + :py:func:`np.matmul(a,b) <numpy.matmul>`. Repeated subscript labels in one + operand take the diagonal. For example, ``np.einsum('ii', a)`` is equivalent + to :py:func:`np.trace(a) <numpy.trace>`. + + In *implicit mode*, the chosen subscripts are important + since the axes of the output are reordered alphabetically. This + means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while + ``np.einsum('ji', a)`` takes its transpose. Additionally, + ``np.einsum('ij,jk', a, b)`` returns a matrix multiplication, while, + ``np.einsum('ij,jh', a, b)`` returns the transpose of the + multiplication since subscript 'h' precedes subscript 'i'. + + In *explicit mode* the output can be directly controlled by + specifying output subscript labels. This requires the + identifier '->' as well as the list of output subscript labels. + This feature increases the flexibility of the function since + summing can be disabled or forced when required. The call + ``np.einsum('i->', a)`` is like :py:func:`np.sum(a, axis=-1) <numpy.sum>`, + and ``np.einsum('ii->i', a)`` is like :py:func:`np.diag(a) <numpy.diag>`. + The difference is that `einsum` does not allow broadcasting by default. + Additionally ``np.einsum('ij,jh->ih', a, b)`` directly specifies the + order of the output subscript labels and therefore returns matrix + multiplication, unlike the example above in implicit mode. To enable and control broadcasting, use an ellipsis. 
Default NumPy-style broadcasting is done by adding an ellipsis to the left of each term, like ``np.einsum('...ii->...i', a)``. To take the trace along the first and last axes, you can do ``np.einsum('i...i', a)``, or to do a matrix-matrix - product with the left-most indices instead of rightmost, you can do + product with the left-most indices instead of rightmost, one can do ``np.einsum('ij...,jk...->ik...', a, b)``. When there is only one operand, no axes are summed, and no output parameter is provided, a view into the operand is returned instead of a new array. Thus, taking the diagonal as ``np.einsum('ii->i', a)`` - produces a view. + produces a view (changed in version 1.10.0). - An alternative way to provide the subscripts and operands is as - ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. The examples - below have corresponding `einsum` calls with the two parameter methods. + `einsum` also provides an alternative way to provide the subscripts + and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. + If the output shape is not provided in this format `einsum` will be + calculated in implicit mode, otherwise it will be performed explicitly. + The examples below have corresponding `einsum` calls with the two + parameter methods. .. versionadded:: 1.10.0 Views returned from einsum are now writeable whenever the input array is writeable. For example, ``np.einsum('ijk...->kji...', a)`` will now - have the same effect as ``np.swapaxes(a, 0, 2)`` and - ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal + have the same effect as :py:func:`np.swapaxes(a, 0, 2) <numpy.swapaxes>` + and ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal of a 2D array. Examples @@ -2377,6 +2479,8 @@ add_newdoc('numpy.core', 'c_einsum', >>> b = np.arange(5) >>> c = np.arange(6).reshape(2,3) + Trace of a matrix: + >>> np.einsum('ii', a) 60 >>> np.einsum(a, [0,0]) @@ -2384,6 +2488,8 @@ add_newdoc('numpy.core', 'c_einsum', >>> np.trace(a) 60 + Extract the diagonal (requires explicit form): + >>> np.einsum('ii->i', a) array([ 0, 6, 12, 18, 24]) >>> np.einsum(a, [0,0], [0]) @@ -2391,31 +2497,69 @@ add_newdoc('numpy.core', 'c_einsum', >>> np.diag(a) array([ 0, 6, 12, 18, 24]) - >>> np.einsum('ij,j', a, b) - array([ 30, 80, 130, 180, 230]) - >>> np.einsum(a, [0,1], b, [1]) - array([ 30, 80, 130, 180, 230]) - >>> np.dot(a, b) - array([ 30, 80, 130, 180, 230]) - >>> np.einsum('...j,j', a, b) - array([ 30, 80, 130, 180, 230]) + Sum over an axis (requires explicit form): + + >>> np.einsum('ij->i', a) + array([ 10, 35, 60, 85, 110]) + >>> np.einsum(a, [0,1], [0]) + array([ 10, 35, 60, 85, 110]) + >>> np.sum(a, axis=1) + array([ 10, 35, 60, 85, 110]) + + For higher dimensional arrays summing a single axis can be done with ellipsis: + + >>> np.einsum('...j->...', a) + array([ 10, 35, 60, 85, 110]) + >>> np.einsum(a, [Ellipsis,1], [Ellipsis]) + array([ 10, 35, 60, 85, 110]) + + Compute a matrix transpose, or reorder any number of axes: >>> np.einsum('ji', c) array([[0, 3], [1, 4], [2, 5]]) + >>> np.einsum('ij->ji', c) + array([[0, 3], + [1, 4], + [2, 5]]) >>> np.einsum(c, [1,0]) array([[0, 3], [1, 4], [2, 5]]) - >>> c.T + >>> np.transpose(c) array([[0, 3], [1, 4], [2, 5]]) + Vector inner products: + + >>> np.einsum('i,i', b, b) + 30 + >>> np.einsum(b, [0], b, [0]) + 30 + >>> np.inner(b,b) + 30 + + Matrix vector multiplication: + + >>> np.einsum('ij,j', a, b) + array([ 30, 80, 130, 180, 230]) + >>> np.einsum(a, [0,1], b, [1]) + array([ 30, 80, 130, 180, 
230]) + >>> np.dot(a, b) + array([ 30, 80, 130, 180, 230]) + >>> np.einsum('...j,j', a, b) + array([ 30, 80, 130, 180, 230]) + + Broadcasting and scalar multiplication: + >>> np.einsum('..., ...', 3, c) array([[ 0, 3, 6], [ 9, 12, 15]]) + >>> np.einsum(',ij', 3, c) + array([[ 0, 3, 6], + [ 9, 12, 15]]) >>> np.einsum(3, [Ellipsis], c, [Ellipsis]) array([[ 0, 3, 6], [ 9, 12, 15]]) @@ -2423,12 +2567,7 @@ add_newdoc('numpy.core', 'c_einsum', array([[ 0, 3, 6], [ 9, 12, 15]]) - >>> np.einsum('i,i', b, b) - 30 - >>> np.einsum(b, [0], b, [0]) - 30 - >>> np.inner(b,b) - 30 + Vector outer product: >>> np.einsum('i,j', np.arange(2)+1, b) array([[0, 1, 2, 3, 4], @@ -2440,12 +2579,7 @@ add_newdoc('numpy.core', 'c_einsum', array([[0, 1, 2, 3, 4], [0, 2, 4, 6, 8]]) - >>> np.einsum('i...->...', a) - array([50, 55, 60, 65, 70]) - >>> np.einsum(a, [0,Ellipsis], [Ellipsis]) - array([50, 55, 60, 65, 70]) - >>> np.sum(a, axis=0) - array([50, 55, 60, 65, 70]) + Tensor contraction: >>> a = np.arange(60.).reshape(3,4,5) >>> b = np.arange(24.).reshape(4,3,2) @@ -2468,6 +2602,17 @@ add_newdoc('numpy.core', 'c_einsum', [ 4796., 5162.], [ 4928., 5306.]]) + Writeable returned arrays (since version 1.10.0): + + >>> a = np.zeros((3, 3)) + >>> np.einsum('ii->i', a)[:] = 1 + >>> a + array([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + + Example of ellipsis use: + >>> a = np.arange(6).reshape((3,2)) >>> b = np.arange(12).reshape((4,3)) >>> np.einsum('ki,jk->ij', a, b) @@ -2480,69 +2625,6 @@ add_newdoc('numpy.core', 'c_einsum', array([[10, 28, 46, 64], [13, 40, 67, 94]]) - >>> # since version 1.10.0 - >>> a = np.zeros((3, 3)) - >>> np.einsum('ii->i', a)[:] = 1 - >>> a - array([[ 1., 0., 0.], - [ 0., 1., 0.], - [ 0., 0., 1.]]) - - """) - -add_newdoc('numpy.core', 'vdot', - """ - vdot(a, b) - - Return the dot product of two vectors. - - The vdot(`a`, `b`) function handles complex numbers differently than - dot(`a`, `b`). If the first argument is complex the complex conjugate - of the first argument is used for the calculation of the dot product. - - Note that `vdot` handles multidimensional arrays differently than `dot`: - it does *not* perform a matrix product, but flattens input arguments - to 1-D vectors first. Consequently, it should only be used for vectors. - - Parameters - ---------- - a : array_like - If `a` is complex the complex conjugate is taken before calculation - of the dot product. - b : array_like - Second argument to the dot product. - - Returns - ------- - output : ndarray - Dot product of `a` and `b`. Can be an int, float, or - complex depending on the types of `a` and `b`. - - See Also - -------- - dot : Return the dot product without using the complex conjugate of the - first argument. - - Examples - -------- - >>> a = np.array([1+2j,3+4j]) - >>> b = np.array([5+6j,7+8j]) - >>> np.vdot(a, b) - (70-8j) - >>> np.vdot(b, a) - (70+8j) - - Note that higher-dimensional arrays are flattened! - - >>> a = np.array([[1, 4], [5, 6]]) - >>> b = np.array([[4, 1], [2, 2]]) - >>> np.vdot(a, b) - 30 - >>> np.vdot(b, a) - 30 - >>> 1*4 + 4*1 + 5*2 + 6*2 - 30 - """) @@ -5215,99 +5297,6 @@ add_newdoc('numpy.core.umath', 'seterrobj', # ############################################################################## -add_newdoc('numpy.core.multiarray', 'digitize', - """ - digitize(x, bins, right=False) - - Return the indices of the bins to which each value in input array belongs. 
- - ========= ============= ============================ - `right` order of bins returned index `i` satisfies - ========= ============= ============================ - ``False`` increasing ``bins[i-1] <= x < bins[i]`` - ``True`` increasing ``bins[i-1] < x <= bins[i]`` - ``False`` decreasing ``bins[i-1] > x >= bins[i]`` - ``True`` decreasing ``bins[i-1] >= x > bins[i]`` - ========= ============= ============================ - - If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is - returned as appropriate. - - Parameters - ---------- - x : array_like - Input array to be binned. Prior to NumPy 1.10.0, this array had to - be 1-dimensional, but can now have any shape. - bins : array_like - Array of bins. It has to be 1-dimensional and monotonic. - right : bool, optional - Indicating whether the intervals include the right or the left bin - edge. Default behavior is (right==False) indicating that the interval - does not include the right edge. The left bin end is open in this - case, i.e., bins[i-1] <= x < bins[i] is the default behavior for - monotonically increasing bins. - - Returns - ------- - indices : ndarray of ints - Output array of indices, of same shape as `x`. - - Raises - ------ - ValueError - If `bins` is not monotonic. - TypeError - If the type of the input is complex. - - See Also - -------- - bincount, histogram, unique, searchsorted - - Notes - ----- - If values in `x` are such that they fall outside the bin range, - attempting to index `bins` with the indices that `digitize` returns - will result in an IndexError. - - .. versionadded:: 1.10.0 - - `np.digitize` is implemented in terms of `np.searchsorted`. This means - that a binary search is used to bin the values, which scales much better - for larger number of bins than the previous linear search. It also removes - the requirement for the input array to be 1-dimensional. - - For monotonically _increasing_ `bins`, the following are equivalent:: - - np.digitize(x, bins, right=True) - np.searchsorted(bins, x, side='left') - - Note that as the order of the arguments are reversed, the side must be too. - The `searchsorted` call is marginally faster, as it does not do any - monotonicity checks. Perhaps more importantly, it supports all dtypes. - - Examples - -------- - >>> x = np.array([0.2, 6.4, 3.0, 1.6]) - >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) - >>> inds = np.digitize(x, bins) - >>> inds - array([1, 4, 3, 2]) - >>> for n in range(x.size): - ... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]]) - ... - 0.0 <= 0.2 < 1.0 - 4.0 <= 6.4 < 10.0 - 2.5 <= 3.0 < 4.0 - 1.0 <= 1.6 < 2.5 - - >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.]) - >>> bins = np.array([0, 5, 10, 15, 20]) - >>> np.digitize(x,bins,right=True) - array([1, 2, 3, 4, 4]) - >>> np.digitize(x,bins,right=False) - array([1, 3, 3, 4, 5]) - """) - add_newdoc('numpy.core.multiarray', 'bincount', """ bincount(x, weights=None, minlength=0) @@ -7144,8 +7133,8 @@ add_newdoc('numpy.core.multiarray', 'datetime_data', Get information about the step size of a date or time type. - The returned tuple can be passed as the second argument of `datetime64` and - `timedelta64`. + The returned tuple can be passed as the second argument of `numpy.datetime64` and + `numpy.timedelta64`. 
Parameters ---------- @@ -7175,94 +7164,6 @@ add_newdoc('numpy.core.multiarray', 'datetime_data', numpy.datetime64('2010-01-01T00:00:00','25s') """) -############################################################################## -# -# nd_grid instances -# -############################################################################## - -add_newdoc('numpy.lib.index_tricks', 'mgrid', - """ - `nd_grid` instance which returns a dense multi-dimensional "meshgrid". - - An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense - (or fleshed out) mesh-grid when indexed, so that each returned argument - has the same shape. The dimensions and number of the output arrays are - equal to the number of indexing dimensions. If the step length is not a - complex number, then the stop is not inclusive. - - However, if the step length is a **complex number** (e.g. 5j), then - the integer part of its magnitude is interpreted as specifying the - number of points to create between the start and stop values, where - the stop value **is inclusive**. - - Returns - ---------- - mesh-grid `ndarrays` all of the same dimensions - - See Also - -------- - numpy.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects - ogrid : like mgrid but returns open (not fleshed out) mesh grids - r_ : array concatenator - - Examples - -------- - >>> np.mgrid[0:5,0:5] - array([[[0, 0, 0, 0, 0], - [1, 1, 1, 1, 1], - [2, 2, 2, 2, 2], - [3, 3, 3, 3, 3], - [4, 4, 4, 4, 4]], - [[0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4], - [0, 1, 2, 3, 4]]]) - >>> np.mgrid[-1:1:5j] - array([-1. , -0.5, 0. , 0.5, 1. ]) - - """) - -add_newdoc('numpy.lib.index_tricks', 'ogrid', - """ - `nd_grid` instance which returns an open multi-dimensional "meshgrid". - - An instance of `numpy.lib.index_tricks.nd_grid` which returns an open - (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension - of each returned array is greater than 1. The dimension and number of the - output arrays are equal to the number of indexing dimensions. If the step - length is not a complex number, then the stop is not inclusive. - - However, if the step length is a **complex number** (e.g. 5j), then - the integer part of its magnitude is interpreted as specifying the - number of points to create between the start and stop values, where - the stop value **is inclusive**. - - Returns - ---------- - mesh-grid `ndarrays` with only one dimension :math:`\\neq 1` - - See Also - -------- - np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects - mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids - r_ : array concatenator - - Examples - -------- - >>> from numpy import ogrid - >>> ogrid[-1:1:5j] - array([-1. , -0.5, 0. , 0.5, 1. ]) - >>> ogrid[0:5,0:5] - [array([[0], - [1], - [2], - [3], - [4]]), array([[0, 1, 2, 3, 4]])] - - """) - ############################################################################## # diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index 6d15cb23f..a4b5aecc3 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -528,6 +528,8 @@ def array2string(a, max_line_width=None, precision=None, The output is left-padded by the length of the prefix string, and wrapping is forced at the column ``max_line_width - len(suffix)``. + It should be noted that the content of prefix and suffix strings are + not included in the output. style : _NoValue, optional Has no effect, do not use. 
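To make the new array2string note above concrete — a minimal sketch, assuming only that NumPy is importable; the variable names and widths here are illustrative, not part of the patch:

    import numpy as np

    a = np.arange(12)
    # `prefix` reserves left padding and moves the wrap column, but it is
    # not included in the returned string, so the caller prepends it:
    s = np.array2string(a, max_line_width=30, prefix="arr = ")
    print("arr = " + s)

Continuation lines are indented by len(prefix), so the printed block lines up under the prefix.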
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt index cc6c3a5fb..43c32eac6 100644 --- a/numpy/core/code_generators/cversions.txt +++ b/numpy/core/code_generators/cversions.txt @@ -39,8 +39,7 @@ 0x0000000b = edb1ba83730c650fd9bc5772a919cda7 # Version 12 (NumPy 1.14) Added PyArray_ResolveWritebackIfCopy, +# Version 12 (NumPy 1.15) No change. # PyArray_SetWritebackIfCopyBase and deprecated PyArray_SetUpdateIfCopyBase. 0x0000000c = a1bc756c5782853ec2e3616cf66869d8 -# Version 13 (NumPy 1.15) Added NpyIter_Close -0x0000000d = 4386e829d65aafce6bd09a85b142d585 diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index 6cfbbbcc7..d8a9ee6b4 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -350,8 +350,6 @@ multiarray_funcs_api = { 'PyArray_ResolveWritebackIfCopy': (302,), 'PyArray_SetWritebackIfCopyBase': (303,), # End 1.14 API - 'NpyIter_Close': (304,), - # End 1.15 API } ufunc_types_api = { diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py index f7d58a26f..6e5cb25af 100644 --- a/numpy/core/code_generators/ufunc_docstrings.py +++ b/numpy/core/code_generators/ufunc_docstrings.py @@ -233,7 +233,7 @@ add_newdoc('numpy.core.umath', 'arccosh', .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ .. [2] Wikipedia, "Inverse hyperbolic function", - http://en.wikipedia.org/wiki/Arccosh + https://en.wikipedia.org/wiki/Arccosh Examples -------- @@ -335,7 +335,7 @@ add_newdoc('numpy.core.umath', 'arcsinh', .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ .. [2] Wikipedia, "Inverse hyperbolic function", - http://en.wikipedia.org/wiki/Arcsinh + https://en.wikipedia.org/wiki/Arcsinh Examples -------- @@ -535,7 +535,7 @@ add_newdoc('numpy.core.umath', 'arctanh', .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ .. [2] Wikipedia, "Inverse hyperbolic function", - http://en.wikipedia.org/wiki/Arctanh + https://en.wikipedia.org/wiki/Arctanh Examples -------- @@ -1136,7 +1136,7 @@ add_newdoc('numpy.core.umath', 'exp', References ---------- .. [1] Wikipedia, "Exponential function", - http://en.wikipedia.org/wiki/Exponential_function + https://en.wikipedia.org/wiki/Exponential_function .. [2] M. Abramovitz and I. A. Stegun, "Handbook of Mathematical Functions with Formulas, Graphs, and Mathematical Tables," Dover, 1964, p. 69, http://www.math.sfu.ca/~cbm/aands/page_69.htm @@ -1551,7 +1551,7 @@ add_newdoc('numpy.core.umath', 'invert', References ---------- .. [1] Wikipedia, "Two's complement", - http://en.wikipedia.org/wiki/Two's_complement + https://en.wikipedia.org/wiki/Two's_complement Examples -------- @@ -1740,6 +1740,8 @@ add_newdoc('numpy.core.umath', 'isnat', """ Test element-wise for NaT (not a time) and return result as a boolean array. + .. versionadded:: 1.13.0 + Parameters ---------- x : array_like @@ -1912,7 +1914,7 @@ add_newdoc('numpy.core.umath', 'log', ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ - .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + .. [2] Wikipedia, "Logarithm". 
https://en.wikipedia.org/wiki/Logarithm Examples -------- @@ -1961,7 +1963,7 @@ add_newdoc('numpy.core.umath', 'log10', ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ - .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples -------- @@ -2147,7 +2149,7 @@ add_newdoc('numpy.core.umath', 'log1p', ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ - .. [2] Wikipedia, "Logarithm". http://en.wikipedia.org/wiki/Logarithm + .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples -------- @@ -3578,7 +3580,7 @@ add_newdoc('numpy.core.umath', 'tanh', http://www.math.sfu.ca/~cbm/aands/ .. [2] Wikipedia, "Hyperbolic function", - http://en.wikipedia.org/wiki/Hyperbolic_function + https://en.wikipedia.org/wiki/Hyperbolic_function Examples -------- diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py index a4c18d482..163f125c2 100644 --- a/numpy/core/einsumfunc.py +++ b/numpy/core/einsumfunc.py @@ -4,6 +4,8 @@ Implementation of optimized einsum. """ from __future__ import division, absolute_import, print_function +import itertools + from numpy.compat import basestring from numpy.core.multiarray import c_einsum from numpy.core.numeric import asarray, asanyarray, result_type, tensordot, dot @@ -14,6 +16,44 @@ einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' einsum_symbols_set = set(einsum_symbols) +def _flop_count(idx_contraction, inner, num_terms, size_dictionary): + """ + Computes the number of FLOPS in the contraction. + + Parameters + ---------- + idx_contraction : iterable + The indices involved in the contraction + inner : bool + Does this contraction require an inner product? + num_terms : int + The number of terms in a contraction + size_dictionary : dict + The size of each of the indices in idx_contraction + + Returns + ------- + flop_count : int + The total number of FLOPS required for the contraction. 
+ + Examples + -------- + + >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5}) + 90 + + >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5}) + 270 + + """ + + overall_size = _compute_size_by_dict(idx_contraction, size_dictionary) + op_factor = max(1, num_terms - 1) + if inner: + op_factor += 1 + + return overall_size * op_factor + def _compute_size_by_dict(indices, idx_dict): """ Computes the product of the elements in indices based on the dictionary @@ -139,14 +179,9 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): iter_results = [] # Compute all unique pairs - comb_iter = [] - for x in range(len(input_sets) - iteration): - for y in range(x + 1, len(input_sets) - iteration): - comb_iter.append((x, y)) - for curr in full_results: cost, positions, remaining = curr - for con in comb_iter: + for con in itertools.combinations(range(len(input_sets) - iteration), 2): # Find the contraction cont = _find_contraction(con, remaining, output_set) @@ -157,15 +192,10 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): if new_size > memory_limit: continue - # Find cost - new_cost = _compute_size_by_dict(idx_contract, idx_dict) - if idx_removed: - new_cost *= 2 - # Build (total_cost, positions, indices_remaining) - new_cost += cost + total_cost = cost + _flop_count(idx_contract, idx_removed, len(con), idx_dict) new_pos = positions + [con] - iter_results.append((new_cost, new_pos, new_input_sets)) + iter_results.append((total_cost, new_pos, new_input_sets)) # Update combinatorial list, if we did not find anything return best # path + remaining contractions @@ -183,6 +213,102 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): path = min(full_results, key=lambda x: x[0])[1] return path +def _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost, naive_cost): + """Compute the cost (removed size + flops) and resultant indices for + performing the contraction specified by ``positions``. + + Parameters + ---------- + positions : tuple of int + The locations of the proposed tensors to contract. + input_sets : list of sets + The indices found on each tensors. + output_set : set + The output indices of the expression. + idx_dict : dict + Mapping of each index to its size. + memory_limit : int + The total allowed size for an intermediary tensor. + path_cost : int + The contraction cost so far. + naive_cost : int + The cost of the unoptimized expression. + + Returns + ------- + cost : (int, int) + A tuple containing the size of any indices removed, and the flop cost. + positions : tuple of int + The locations of the proposed tensors to contract. + new_input_sets : list of sets + The resulting new list of indices if this proposed contraction is performed. 
+ + """ + + # Find the contraction + contract = _find_contraction(positions, input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(idx_result, idx_dict) + if new_size > memory_limit: + return None + + # Build sort tuple + old_sizes = (_compute_size_by_dict(input_sets[p], idx_dict) for p in positions) + removed_size = sum(old_sizes) - new_size + + # NB: removed_size used to be just the size of any removed indices i.e.: + # helpers.compute_size_by_dict(idx_removed, idx_dict) + cost = _flop_count(idx_contract, idx_removed, len(positions), idx_dict) + sort = (-removed_size, cost) + + # Sieve based on total cost as well + if (path_cost + cost) > naive_cost: + return None + + # Add contraction to possible choices + return [sort, positions, new_input_sets] + + +def _update_other_results(results, best): + """Update the positions and provisional input_sets of ``results`` based on + performing the contraction result ``best``. Remove any involving the tensors + contracted. + + Parameters + ---------- + results : list + List of contraction results produced by ``_parse_possible_contraction``. + best : list + The best contraction of ``results`` i.e. the one that will be performed. + + Returns + ------- + mod_results : list + The list of modifed results, updated with outcome of ``best`` contraction. + """ + + best_con = best[1] + bx, by = best_con + mod_results = [] + + for cost, (x, y), con_sets in results: + + # Ignore results involving tensors just contracted + if x in best_con or y in best_con: + continue + + # Update the input_sets + del con_sets[by - int(by > x) - int(by > y)] + del con_sets[bx - int(bx > x) - int(bx > y)] + con_sets.insert(-1, best[2][-1]) + + # Update the position indices + mod_con = x - int(x > bx) - int(x > by), y - int(y > bx) - int(y > by) + mod_results.append((cost, mod_con, con_sets)) + + return mod_results def _greedy_path(input_sets, output_set, idx_dict, memory_limit): """ @@ -219,46 +345,68 @@ def _greedy_path(input_sets, output_set, idx_dict, memory_limit): [(0, 2), (0, 1)] """ + # Handle trivial cases that leaked through if len(input_sets) == 1: return [(0,)] + elif len(input_sets) == 2: + return [(0, 1)] + + # Build up a naive cost + contract = _find_contraction(range(len(input_sets)), input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + naive_cost = _flop_count(idx_contract, idx_removed, len(input_sets), idx_dict) + # Initially iterate over all pairs + comb_iter = itertools.combinations(range(len(input_sets)), 2) + known_contractions = [] + + path_cost = 0 path = [] - for iteration in range(len(input_sets) - 1): - iteration_results = [] - comb_iter = [] - # Compute all unique pairs - for x in range(len(input_sets)): - for y in range(x + 1, len(input_sets)): - comb_iter.append((x, y)) + for iteration in range(len(input_sets) - 1): + # Iterate over all pairs on first step, only previously found pairs on subsequent steps for positions in comb_iter: - # Find the contraction - contract = _find_contraction(positions, input_sets, output_set) - idx_result, new_input_sets, idx_removed, idx_contract = contract - - # Sieve the results based on memory_limit - if _compute_size_by_dict(idx_result, idx_dict) > memory_limit: + # Always initially ignore outer products + if input_sets[positions[0]].isdisjoint(input_sets[positions[1]]): continue - # Build sort tuple - removed_size = _compute_size_by_dict(idx_removed, idx_dict) - 
cost = _compute_size_by_dict(idx_contract, idx_dict) - sort = (-removed_size, cost) + result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost, + naive_cost) + if result is not None: + known_contractions.append(result) + + # If we do not have a inner contraction, rescan pairs including outer products + if len(known_contractions) == 0: - # Add contraction to possible choices - iteration_results.append([sort, positions, new_input_sets]) + # Then check the outer products + for positions in itertools.combinations(range(len(input_sets)), 2): + result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, + path_cost, naive_cost) + if result is not None: + known_contractions.append(result) - # If we did not find a new contraction contract remaining - if len(iteration_results) == 0: - path.append(tuple(range(len(input_sets)))) - break + # If we still did not find any remaining contractions, default back to einsum like behavior + if len(known_contractions) == 0: + path.append(tuple(range(len(input_sets)))) + break # Sort based on first index - best = min(iteration_results, key=lambda x: x[0]) - path.append(best[1]) + best = min(known_contractions, key=lambda x: x[0]) + + # Now propagate as many unused contractions as possible to next iteration + known_contractions = _update_other_results(known_contractions, best) + + # Next iteration only compute contractions with the new tensor + # All other contractions have been accounted for input_sets = best[2] + new_tensor_pos = len(input_sets) - 1 + comb_iter = ((i, new_tensor_pos) for i in range(new_tensor_pos)) + + # Update path and total cost + path.append(best[1]) + path_cost += best[0][1] return path @@ -314,26 +462,27 @@ def _can_dot(inputs, result, idx_removed): if len(inputs) != 2: return False - # Build a few temporaries input_left, input_right = inputs + + for c in set(input_left + input_right): + # can't deal with repeated indices on same input or more than 2 total + nl, nr = input_left.count(c), input_right.count(c) + if (nl > 1) or (nr > 1) or (nl + nr > 2): + return False + + # can't do implicit summation or dimension collapse e.g. + # "ab,bc->c" (implicitly sum over 'a') + # "ab,ca->ca" (take diagonal of 'a') + if nl + nr - 1 == int(c in result): + return False + + # Build a few temporaries set_left = set(input_left) set_right = set(input_right) keep_left = set_left - idx_removed keep_right = set_right - idx_removed rs = len(idx_removed) - # Indices must overlap between the two operands - if not len(set_left & set_right): - return False - - # We cannot have duplicate indices ("ijj, jk -> ik") - if (len(set_left) != len(input_left)) or (len(set_right) != len(input_right)): - return False - - # Cannot handle partial inner ("ij, ji -> i") - if len(keep_left & keep_right): - return False - # At this point we are a DOT, GEMV, or GEMM operation # Handle inner products @@ -371,6 +520,7 @@ def _can_dot(inputs, result, idx_removed): # We are a matrix-matrix product, but we need to copy data return True + def _parse_einsum_input(operands): """ A reproduction of einsum c side einsum parsing in python. 
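The hunks above replace the hand-rolled pair enumeration with itertools.combinations and route all costing through _flop_count and _parse_possible_contraction. As context, a short sketch of the public entry point that exercises this machinery; the array shapes are illustrative:

    import numpy as np

    a = np.random.rand(10, 20)
    b = np.random.rand(20, 30)
    c = np.random.rand(30, 5)

    # einsum_path returns the contraction order plus a printable summary
    # of the naive and optimized FLOP estimates; the path has the form
    # e.g. ['einsum_path', (1, 2), (0, 1)].
    path, info = np.einsum_path('ij,jk,kl->il', a, b, c, optimize='greedy')
    print(path)
    print(info)

The returned path can be fed back as np.einsum(..., optimize=path) to avoid recomputing it, as the einsum docstring changes later in this diff describe.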
@@ -697,6 +847,7 @@ def einsum_path(*operands, **kwargs): # Get length of each unique dimension and ensure all dimensions are correct dimension_dict = {} + broadcast_indices = [[] for x in range(len(input_list))] for tnum, term in enumerate(input_list): sh = operands[tnum].shape if len(sh) != len(term): @@ -705,6 +856,11 @@ def einsum_path(*operands, **kwargs): % (input_subscripts[tnum], tnum)) for cnum, char in enumerate(term): dim = sh[cnum] + + # Build out broadcast indices + if dim == 1: + broadcast_indices[tnum].append(char) + if char in dimension_dict.keys(): # For broadcasting cases we always want the largest dim size if dimension_dict[char] == 1: @@ -716,6 +872,9 @@ def einsum_path(*operands, **kwargs): else: dimension_dict[char] = dim + # Convert broadcast inds to sets + broadcast_indices = [set(x) for x in broadcast_indices] + # Compute size of each input array plus the output array size_list = [] for term in input_list + [output_subscript]: @@ -729,20 +888,14 @@ def einsum_path(*operands, **kwargs): # Compute naive cost # This isn't quite right, need to look into exactly how einsum does this - naive_cost = _compute_size_by_dict(indices, dimension_dict) - indices_in_input = input_subscripts.replace(',', '') - mult = max(len(input_list) - 1, 1) - if (len(indices_in_input) - len(set(indices_in_input))): - mult *= 2 - naive_cost *= mult + inner_product = (sum(len(x) for x in input_sets) - len(indices)) > 0 + naive_cost = _flop_count(indices, inner_product, len(input_list), dimension_dict) # Compute the path if (path_type is False) or (len(input_list) in [1, 2]) or (indices == output_set): # Nothing to be optimized, leave it to einsum path = [tuple(range(len(input_list)))] elif path_type == "greedy": - # Maximum memory should be at most out_size for this algorithm - memory_arg = min(memory_arg, max_size) path = _greedy_path(input_sets, output_set, dimension_dict, memory_arg) elif path_type == "optimal": path = _optimal_path(input_sets, output_set, dimension_dict, memory_arg) @@ -761,18 +914,24 @@ def einsum_path(*operands, **kwargs): contract = _find_contraction(contract_inds, input_sets, output_set) out_inds, input_sets, idx_removed, idx_contract = contract - cost = _compute_size_by_dict(idx_contract, dimension_dict) - if idx_removed: - cost *= 2 + cost = _flop_count(idx_contract, idx_removed, len(contract_inds), dimension_dict) cost_list.append(cost) scale_list.append(len(idx_contract)) size_list.append(_compute_size_by_dict(out_inds, dimension_dict)) + bcast = set() tmp_inputs = [] for x in contract_inds: tmp_inputs.append(input_list.pop(x)) + bcast |= broadcast_indices.pop(x) - do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + new_bcast_inds = bcast - idx_removed + + # If we're broadcasting, nix blas + if not len(idx_removed & bcast): + do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + else: + do_blas = False # Last contraction if (cnum - len(path)) == -1: @@ -782,6 +941,7 @@ def einsum_path(*operands, **kwargs): idx_result = "".join([x[1] for x in sorted(sort_result)]) input_list.append(idx_result) + broadcast_indices.append(new_bcast_inds) einsum_str = ",".join(tmp_inputs) + "->" + idx_result contraction = (contract_inds, idx_removed, einsum_str, input_list[:], do_blas) @@ -828,19 +988,27 @@ def einsum(*operands, **kwargs): Evaluates the Einstein summation convention on the operands. - Using the Einstein summation convention, many common multi-dimensional - array operations can be represented in a simple fashion. 
This function - provides a way to compute such summations. The best way to understand this - function is to try the examples below, which show how many common NumPy - functions can be implemented as calls to `einsum`. + Using the Einstein summation convention, many common multi-dimensional, + linear algebraic array operations can be represented in a simple fashion. + In *implicit* mode `einsum` computes these values. + + In *explicit* mode, `einsum` provides further flexibility to compute + other array operations that might not be considered classical Einstein + summation operations, by disabling, or forcing summation over specified + subscript labels. + + See the notes and examples for clarification. Parameters ---------- subscripts : str - Specifies the subscripts for summation. + Specifies the subscripts for summation as comma separated list of + subscript labels. An implicit (classical Einstein summation) + calculation is performed unless the explicit indicator '->' is + included as well as subscript labels of the precise output form. operands : list of array_like These are the arrays for the operation. - out : {ndarray, None}, optional + out : ndarray, optional If provided, the calculation is done into this array. dtype : {data-type, None}, optional If provided, forces the calculation to use the data type specified. @@ -869,7 +1037,7 @@ def einsum(*operands, **kwargs): Controls if intermediate optimization should occur. No optimization will occur if False and True will default to the 'greedy' algorithm. Also accepts an explicit contraction list from the ``np.einsum_path`` - function. See ``np.einsum_path`` for more details. Default is True. + function. See ``np.einsum_path`` for more details. Defaults to False. Returns ------- @@ -884,50 +1052,80 @@ def einsum(*operands, **kwargs): ----- .. versionadded:: 1.6.0 - The subscripts string is a comma-separated list of subscript labels, - where each label refers to a dimension of the corresponding operand. - Repeated subscripts labels in one operand take the diagonal. For example, - ``np.einsum('ii', a)`` is equivalent to ``np.trace(a)``. + The Einstein summation convention can be used to compute + many multi-dimensional, linear algebraic array operations. `einsum` + provides a succinct way of representing these. - Whenever a label is repeated, it is summed, so ``np.einsum('i,i', a, b)`` - is equivalent to ``np.inner(a,b)``. If a label appears only once, - it is not summed, so ``np.einsum('i', a)`` produces a view of ``a`` - with no changes. + A non-exhaustive list of these operations, + which can be computed by `einsum`, is shown below along with examples: - The order of labels in the output is by default alphabetical. This - means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while - ``np.einsum('ji', a)`` takes its transpose. + * Trace of an array, :py:func:`numpy.trace`. + * Return a diagonal, :py:func:`numpy.diag`. + * Array axis summations, :py:func:`numpy.sum`. + * Transpositions and permutations, :py:func:`numpy.transpose`. + * Matrix multiplication and dot product, :py:func:`numpy.matmul` :py:func:`numpy.dot`. + * Vector inner and outer products, :py:func:`numpy.inner` :py:func:`numpy.outer`. + * Broadcasting, element-wise and scalar multiplication, :py:func:`numpy.multiply`. + * Tensor contractions, :py:func:`numpy.tensordot`. + * Chained array operations, in efficient calculation order, :py:func:`numpy.einsum_path`. - The output can be controlled by specifying output subscript labels - as well. 
This specifies the label order, and allows summing to - be disallowed or forced when desired. The call ``np.einsum('i->', a)`` - is like ``np.sum(a, axis=-1)``, and ``np.einsum('ii->i', a)`` - is like ``np.diag(a)``. The difference is that `einsum` does not - allow broadcasting by default. + The subscripts string is a comma-separated list of subscript labels, + where each label refers to a dimension of the corresponding operand. + Whenever a label is repeated it is summed, so ``np.einsum('i,i', a, b)`` + is equivalent to :py:func:`np.inner(a,b) <numpy.inner>`. If a label + appears only once, it is not summed, so ``np.einsum('i', a)`` produces a + view of ``a`` with no changes. A further example ``np.einsum('ij,jk', a, b)`` + describes traditional matrix multiplication and is equivalent to + :py:func:`np.matmul(a,b) <numpy.matmul>`. Repeated subscript labels in one + operand take the diagonal. For example, ``np.einsum('ii', a)`` is equivalent + to :py:func:`np.trace(a) <numpy.trace>`. + + In *implicit mode*, the chosen subscripts are important + since the axes of the output are reordered alphabetically. This + means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while + ``np.einsum('ji', a)`` takes its transpose. Additionally, + ``np.einsum('ij,jk', a, b)`` returns a matrix multiplication, while, + ``np.einsum('ij,jh', a, b)`` returns the transpose of the + multiplication since subscript 'h' precedes subscript 'i'. + + In *explicit mode* the output can be directly controlled by + specifying output subscript labels. This requires the + identifier '->' as well as the list of output subscript labels. + This feature increases the flexibility of the function since + summing can be disabled or forced when required. The call + ``np.einsum('i->', a)`` is like :py:func:`np.sum(a, axis=-1) <numpy.sum>`, + and ``np.einsum('ii->i', a)`` is like :py:func:`np.diag(a) <numpy.diag>`. + The difference is that `einsum` does not allow broadcasting by default. + Additionally ``np.einsum('ij,jh->ih', a, b)`` directly specifies the + order of the output subscript labels and therefore returns matrix + multiplication, unlike the example above in implicit mode. To enable and control broadcasting, use an ellipsis. Default NumPy-style broadcasting is done by adding an ellipsis to the left of each term, like ``np.einsum('...ii->...i', a)``. To take the trace along the first and last axes, you can do ``np.einsum('i...i', a)``, or to do a matrix-matrix - product with the left-most indices instead of rightmost, you can do + product with the left-most indices instead of rightmost, one can do ``np.einsum('ij...,jk...->ik...', a, b)``. When there is only one operand, no axes are summed, and no output parameter is provided, a view into the operand is returned instead of a new array. Thus, taking the diagonal as ``np.einsum('ii->i', a)`` - produces a view. + produces a view (changed in version 1.10.0). - An alternative way to provide the subscripts and operands is as - ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. The examples - below have corresponding `einsum` calls with the two parameter methods. + `einsum` also provides an alternative way to provide the subscripts + and operands as ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. + If the output shape is not provided in this format `einsum` will be + calculated in implicit mode, otherwise it will be performed explicitly. + The examples below have corresponding `einsum` calls with the two + parameter methods. .. 
versionadded:: 1.10.0 Views returned from einsum are now writeable whenever the input array is writeable. For example, ``np.einsum('ijk...->kji...', a)`` will now - have the same effect as ``np.swapaxes(a, 0, 2)`` and - ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal + have the same effect as :py:func:`np.swapaxes(a, 0, 2) <numpy.swapaxes>` + and ``np.einsum('ii->i', a)`` will return a writeable view of the diagonal of a 2D array. .. versionadded:: 1.12.0 @@ -937,7 +1135,14 @@ def einsum(*operands, **kwargs): can greatly increase the computational efficiency at the cost of a larger memory footprint during computation. - See ``np.einsum_path`` for more details. + Typically a 'greedy' algorithm is applied which empirical tests have shown + returns the optimal path in the majority of cases. In some cases 'optimal' + will return the superlative path through a more expensive, exhaustive search. + For iterative calculations it may be advisable to calculate the optimal path + once and reuse that path by supplying it as an argument. An example is given + below. + + See :py:func:`numpy.einsum_path` for more details. Examples -------- @@ -945,6 +1150,8 @@ def einsum(*operands, **kwargs): >>> b = np.arange(5) >>> c = np.arange(6).reshape(2,3) + Trace of a matrix: + >>> np.einsum('ii', a) 60 >>> np.einsum(a, [0,0]) @@ -952,6 +1159,8 @@ def einsum(*operands, **kwargs): >>> np.trace(a) 60 + Extract the diagonal (requires explicit form): + >>> np.einsum('ii->i', a) array([ 0, 6, 12, 18, 24]) >>> np.einsum(a, [0,0], [0]) @@ -959,32 +1168,67 @@ def einsum(*operands, **kwargs): >>> np.diag(a) array([ 0, 6, 12, 18, 24]) - >>> np.einsum('ij,j', a, b) - array([ 30, 80, 130, 180, 230]) - >>> np.einsum(a, [0,1], b, [1]) - array([ 30, 80, 130, 180, 230]) - >>> np.dot(a, b) - array([ 30, 80, 130, 180, 230]) - >>> np.einsum('...j,j', a, b) - array([ 30, 80, 130, 180, 230]) + Sum over an axis (requires explicit form): + + >>> np.einsum('ij->i', a) + array([ 10, 35, 60, 85, 110]) + >>> np.einsum(a, [0,1], [0]) + array([ 10, 35, 60, 85, 110]) + >>> np.sum(a, axis=1) + array([ 10, 35, 60, 85, 110]) + + For higher dimensional arrays summing a single axis can be done with ellipsis: + + >>> np.einsum('...j->...', a) + array([ 10, 35, 60, 85, 110]) + >>> np.einsum(a, [Ellipsis,1], [Ellipsis]) + array([ 10, 35, 60, 85, 110]) + + Compute a matrix transpose, or reorder any number of axes: >>> np.einsum('ji', c) array([[0, 3], [1, 4], [2, 5]]) + >>> np.einsum('ij->ji', c) + array([[0, 3], + [1, 4], + [2, 5]]) >>> np.einsum(c, [1,0]) array([[0, 3], [1, 4], [2, 5]]) - >>> c.T + >>> np.transpose(c) array([[0, 3], [1, 4], [2, 5]]) + Vector inner products: + + >>> np.einsum('i,i', b, b) + 30 + >>> np.einsum(b, [0], b, [0]) + 30 + >>> np.inner(b,b) + 30 + + Matrix vector multiplication: + + >>> np.einsum('ij,j', a, b) + array([ 30, 80, 130, 180, 230]) + >>> np.einsum(a, [0,1], b, [1]) + array([ 30, 80, 130, 180, 230]) + >>> np.dot(a, b) + array([ 30, 80, 130, 180, 230]) + >>> np.einsum('...j,j', a, b) + array([ 30, 80, 130, 180, 230]) + + Broadcasting and scalar multiplication: + >>> np.einsum('..., ...', 3, c) array([[ 0, 3, 6], [ 9, 12, 15]]) - >>> np.einsum(',ij', 3, C) + >>> np.einsum(',ij', 3, c) array([[ 0, 3, 6], [ 9, 12, 15]]) >>> np.einsum(3, [Ellipsis], c, [Ellipsis]) @@ -994,12 +1238,7 @@ def einsum(*operands, **kwargs): array([[ 0, 3, 6], [ 9, 12, 15]]) - >>> np.einsum('i,i', b, b) - 30 - >>> np.einsum(b, [0], b, [0]) - 30 - >>> np.inner(b,b) - 30 + Vector outer product: >>> np.einsum('i,j', 
np.arange(2)+1, b) array([[0, 1, 2, 3, 4], @@ -1011,12 +1250,7 @@ def einsum(*operands, **kwargs): array([[0, 1, 2, 3, 4], [0, 2, 4, 6, 8]]) - >>> np.einsum('i...->...', a) - array([50, 55, 60, 65, 70]) - >>> np.einsum(a, [0,Ellipsis], [Ellipsis]) - array([50, 55, 60, 65, 70]) - >>> np.sum(a, axis=0) - array([50, 55, 60, 65, 70]) + Tensor contraction: >>> a = np.arange(60.).reshape(3,4,5) >>> b = np.arange(24.).reshape(4,3,2) @@ -1039,6 +1273,17 @@ def einsum(*operands, **kwargs): [ 4796., 5162.], [ 4928., 5306.]]) + Writeable returned arrays (since version 1.10.0): + + >>> a = np.zeros((3, 3)) + >>> np.einsum('ii->i', a)[:] = 1 + >>> a + array([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + + Example of ellipsis use: + >>> a = np.arange(6).reshape((3,2)) >>> b = np.arange(12).reshape((4,3)) >>> np.einsum('ki,jk->ij', a, b) @@ -1051,13 +1296,26 @@ def einsum(*operands, **kwargs): array([[10, 28, 46, 64], [13, 40, 67, 94]]) - >>> # since version 1.10.0 - >>> a = np.zeros((3, 3)) - >>> np.einsum('ii->i', a)[:] = 1 - >>> a - array([[ 1., 0., 0.], - [ 0., 1., 0.], - [ 0., 0., 1.]]) + Chained array operations. For more complicated contractions, speed ups + might be achieved by repeatedly computing a 'greedy' path or pre-computing the + 'optimal' path and repeatedly applying it, using an + `einsum_path` insertion (since version 1.12.0). Performance improvements can be + particularly significant with larger arrays: + + >>> a = np.ones(64).reshape(2,4,8) + # Basic `einsum`: ~1520ms (benchmarked on 3.1GHz Intel i5.) + >>> for iteration in range(500): + ... np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a) + # Sub-optimal `einsum` (due to repeated path calculation time): ~330ms + >>> for iteration in range(500): + ... np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal') + # Greedy `einsum` (faster optimal path approximation): ~160ms + >>> for iteration in range(500): + ... np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy') + # Optimal `einsum` (best usage pattern in some use cases): ~110ms + >>> path = np.einsum_path('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')[0] + >>> for iteration in range(500): + ... np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path) """ @@ -1101,25 +1359,14 @@ def einsum(*operands, **kwargs): tmp_operands.append(operands.pop(x)) # Do we need to deal with the output? - if specified_out and ((num + 1) == len(contraction_list)): - handle_out = True + handle_out = specified_out and ((num + 1) == len(contraction_list)) - # Handle broadcasting vs BLAS cases + # Call tensordot if still possible if blas: # Checks have already been handled input_str, results_index = einsum_str.split('->') input_left, input_right = input_str.split(',') - if 1 in tmp_operands[0].shape or 1 in tmp_operands[1].shape: - left_dims = {dim: size for dim, size in - zip(input_left, tmp_operands[0].shape)} - right_dims = {dim: size for dim, size in - zip(input_right, tmp_operands[1].shape)} - # If dims do not match we are broadcasting, BLAS off - if any(left_dims[ind] != right_dims[ind] for ind in idx_rm): - blas = False - # Call tensordot if still possible - if blas: tensor_result = input_left + input_right for s in idx_rm: tensor_result = tensor_result.replace(s, "") diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index d1aae0aa0..b9cc98cae 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -1198,6 +1198,16 @@ def resize(a, new_shape): -------- ndarray.resize : resize an array in-place. 
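A minimal doctest sketch of the memory-order repetition described in the Notes that follow (illustrative only; outputs assume a recent NumPy):

>>> np.resize(np.array([[0, 1], [2, 3]]), (1, 3))
array([[0, 1, 2]])
>>> np.resize(np.array([0, 1, 2, 3]), (6,))
array([0, 1, 2, 3, 0, 1])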
+ Notes + ----- + Warning: This functionality does **not** consider axes separately, + i.e. it does not apply interpolation/extrapolation. + It fills the return array with the required number of elements, taken + from `a` as they are laid out in memory, disregarding strides and axes. + (This applies when the new shape is smaller; for a larger shape, see above.) + This functionality is therefore not suitable for resizing images, + or data where each axis represents a separate and distinct entity. + Examples -------- >>> a=np.array([[0,1],[2,3]]) @@ -1615,16 +1625,16 @@ def nonzero(a): Examples -------- - >>> x = np.array([[1,0,0], [0,2,0], [1,1,0]]) + >>> x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]]) >>> x - array([[1, 0, 0], - [0, 2, 0], - [1, 1, 0]]) + array([[3, 0, 0], + [0, 4, 0], + [5, 6, 0]]) >>> np.nonzero(x) (array([0, 1, 2, 2]), array([0, 1, 0, 1])) >>> x[np.nonzero(x)] - array([1, 2, 1, 1]) + array([3, 4, 5, 6]) >>> np.transpose(np.nonzero(x)) array([[0, 0], [1, 1], @@ -1636,7 +1646,7 @@ def nonzero(a): boolean array and since False is interpreted as 0, np.nonzero(a > 3) yields the indices of `a` where the condition is true. - >>> a = np.array([[1,2,3],[4,5,6],[7,8,9]]) + >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) >>> a > 3 array([[False, False, False], [ True, True, True], @@ -1644,7 +1654,14 @@ def nonzero(a): >>> np.nonzero(a > 3) (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) - The ``nonzero`` method of the boolean array can also be called. + Using this result to index `a` is equivalent to using the mask directly: + + >>> a[np.nonzero(a > 3)] + array([4, 5, 6, 7, 8, 9]) + >>> a[a > 3] # prefer this spelling + array([4, 5, 6, 7, 8, 9]) + + ``nonzero`` can also be called as a method of the array. >>> (a > 3).nonzero() (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2])) @@ -2777,11 +2794,11 @@ def around(a, decimals=0, out=None): References ---------- - .. [1] "Lecture Notes on the Status of IEEE 754", William Kahan, - http://www.cs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF + .. [1] "Lecture Notes on the Status of IEEE 754", William Kahan, + https://people.eecs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF .. [2] "How Futile are Mindless Assessments of Roundoff in Floating-Point Computation?", William Kahan, - http://www.cs.berkeley.edu/~wkahan/Mindless.pdf + https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf Examples -------- diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py index 82de1a36e..fb72bada5 100644 --- a/numpy/core/function_base.py +++ b/numpy/core/function_base.py @@ -6,6 +6,7 @@ import operator from . import numeric as _nx from .numeric import (result_type, NaN, shares_memory, MAY_SHARE_BOUNDS, TooHardError,asanyarray) +from numpy.core.multiarray import add_docstring __all__ = ['logspace', 'linspace', 'geomspace'] @@ -356,3 +357,38 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None): endpoint=endpoint, base=10.0, dtype=dtype) return result.astype(dtype) + + +# always succeed +def add_newdoc(place, obj, doc): + """ + Add documentation to obj, which is in module place. + + If doc is a string, add it to obj as a docstring. + + If doc is a tuple, then the first element is interpreted as + an attribute of obj and the second as the docstring + (method, docstring) + + If doc is a list, then each element of the list should be a + sequence of length two --> [(method1, docstring1), + (method2, docstring2), ...] + + This routine never raises an error. + + This routine cannot modify read-only docstrings, such as those of + new-style classes or built-in functions. Because this + routine never raises an error, the caller must check manually + that the docstrings were changed. + """ + try: + new = getattr(__import__(place, globals(), {}, [obj]), obj) + if isinstance(doc, str): + add_docstring(new, doc.strip()) + elif isinstance(doc, tuple): + add_docstring(getattr(new, doc[0]), doc[1].strip()) + elif isinstance(doc, list): + for val in doc: + add_docstring(getattr(new, val[0]), val[1].strip()) + except Exception: + pass diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index cf73cecea..ec2893b21 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -1759,7 +1759,7 @@ typedef struct { /************************************************************ * This is the form of the struct that's returned, pointed to by the * PyCObject attribute of an array __array_struct__. See - * http://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full + * https://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full * documentation. ************************************************************/ typedef struct { diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index 106ffa450..5edd8f42e 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -39,17 +39,19 @@ * _M_AMD64 defined by MS compiler */ #define NPY_CPU_AMD64 +#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__) + #define NPY_CPU_PPC64LE +#elif defined(__powerpc64__) && defined(__BIG_ENDIAN__) + #define NPY_CPU_PPC64 #elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC) /* * __ppc__ is defined by gcc, I remember having seen __powerpc__ once, * but can't find it ATM * _ARCH_PPC is used by at least gcc on AIX + * As __powerpc__ and _ARCH_PPC are also defined by PPC64, check + * for those specifically first before defaulting to ppc */ #define NPY_CPU_PPC -#elif defined(__ppc64le__) - #define NPY_CPU_PPC64LE -#elif defined(__ppc64__) - #define NPY_CPU_PPC64 #elif defined(__sparc__) || defined(__sparc) /* __sparc__ is defined by gcc and Forte (e.g.
Sun) compilers */ #define NPY_CPU_SPARC @@ -61,10 +63,27 @@ #define NPY_CPU_HPPA #elif defined(__alpha__) #define NPY_CPU_ALPHA -#elif defined(__arm__) && defined(__ARMEL__) - #define NPY_CPU_ARMEL -#elif defined(__arm__) && defined(__ARMEB__) - #define NPY_CPU_ARMEB +#elif defined(__arm__) || defined(__aarch64__) + #if defined(__ARMEB__) || defined(__AARCH64EB__) + #if defined(__ARM_32BIT_STATE) + #define NPY_CPU_ARMEB_AARCH32 + #elif defined(__ARM_64BIT_STATE) + #define NPY_CPU_ARMEB_AARCH64 + #else + #define NPY_CPU_ARMEB + #endif + #elif defined(__ARMEL__) || defined(__AARCH64EL__) + #if defined(__ARM_32BIT_STATE) + #define NPY_CPU_ARMEL_AARCH32 + #elif defined(__ARM_64BIT_STATE) + #define NPY_CPU_ARMEL_AARCH64 + #else + #define NPY_CPU_ARMEL + #endif + #else + # error Unknown ARM CPU, please report this to numpy maintainers with \ + information about your platform (OS, CPU and compiler) + #endif #elif defined(__sh__) && defined(__LITTLE_ENDIAN__) #define NPY_CPU_SH_LE #elif defined(__sh__) && defined(__BIG_ENDIAN__) @@ -75,8 +94,6 @@ #define NPY_CPU_MIPSEB #elif defined(__or1k__) #define NPY_CPU_OR1K -#elif defined(__aarch64__) - #define NPY_CPU_AARCH64 #elif defined(__mc68000__) #define NPY_CPU_M68K #elif defined(__arc__) && defined(__LITTLE_ENDIAN__) diff --git a/numpy/core/include/numpy/npy_endian.h b/numpy/core/include/numpy/npy_endian.h index 649bdb0a6..44cdffd14 100644 --- a/numpy/core/include/numpy/npy_endian.h +++ b/numpy/core/include/numpy/npy_endian.h @@ -37,28 +37,31 @@ #define NPY_LITTLE_ENDIAN 1234 #define NPY_BIG_ENDIAN 4321 - #if defined(NPY_CPU_X86) \ - || defined(NPY_CPU_AMD64) \ - || defined(NPY_CPU_IA64) \ - || defined(NPY_CPU_ALPHA) \ - || defined(NPY_CPU_ARMEL) \ - || defined(NPY_CPU_AARCH64) \ - || defined(NPY_CPU_SH_LE) \ - || defined(NPY_CPU_MIPSEL) \ - || defined(NPY_CPU_PPC64LE) \ - || defined(NPY_CPU_ARCEL) \ + #if defined(NPY_CPU_X86) \ + || defined(NPY_CPU_AMD64) \ + || defined(NPY_CPU_IA64) \ + || defined(NPY_CPU_ALPHA) \ + || defined(NPY_CPU_ARMEL) \ + || defined(NPY_CPU_ARMEL_AARCH32) \ + || defined(NPY_CPU_ARMEL_AARCH64) \ + || defined(NPY_CPU_SH_LE) \ + || defined(NPY_CPU_MIPSEL) \ + || defined(NPY_CPU_PPC64LE) \ + || defined(NPY_CPU_ARCEL) \ || defined(NPY_CPU_RISCV64) #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN - #elif defined(NPY_CPU_PPC) \ - || defined(NPY_CPU_SPARC) \ - || defined(NPY_CPU_S390) \ - || defined(NPY_CPU_HPPA) \ - || defined(NPY_CPU_PPC64) \ - || defined(NPY_CPU_ARMEB) \ - || defined(NPY_CPU_SH_BE) \ - || defined(NPY_CPU_MIPSEB) \ - || defined(NPY_CPU_OR1K) \ - || defined(NPY_CPU_M68K) \ + #elif defined(NPY_CPU_PPC) \ + || defined(NPY_CPU_SPARC) \ + || defined(NPY_CPU_S390) \ + || defined(NPY_CPU_HPPA) \ + || defined(NPY_CPU_PPC64) \ + || defined(NPY_CPU_ARMEB) \ + || defined(NPY_CPU_ARMEB_AARCH32) \ + || defined(NPY_CPU_ARMEB_AARCH64) \ + || defined(NPY_CPU_SH_BE) \ + || defined(NPY_CPU_MIPSEB) \ + || defined(NPY_CPU_OR1K) \ + || defined(NPY_CPU_M68K) \ || defined(NPY_CPU_ARCEB) #define NPY_BYTE_ORDER NPY_BIG_ENDIAN #else diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h index 04a3738b9..ab198f36b 100644 --- a/numpy/core/include/numpy/numpyconfig.h +++ b/numpy/core/include/numpy/numpyconfig.h @@ -36,5 +36,6 @@ #define NPY_1_12_API_VERSION 0x00000008 #define NPY_1_13_API_VERSION 0x00000008 #define NPY_1_14_API_VERSION 0x00000008 +#define NPY_1_15_API_VERSION 0x00000008 #endif diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py index b2ff0e793..536fa6094 100644 --- a/numpy/core/memmap.py 
+++ b/numpy/core/memmap.py @@ -236,6 +236,7 @@ class memmap(ndarray): raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes + shape = (size,) else: if not isinstance(shape, tuple): shape = (shape,) diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index 7ade3d224..e5570791a 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -1010,7 +1010,8 @@ def convolve(a, v, mode='full'): References ---------- - .. [1] Wikipedia, "Convolution", http://en.wikipedia.org/wiki/Convolution. + .. [1] Wikipedia, "Convolution", + https://en.wikipedia.org/wiki/Convolution Examples -------- @@ -1508,11 +1509,14 @@ def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): -------- normalize_axis_index : normalizing a single scalar axis """ - try: - axis = [operator.index(axis)] - except TypeError: - axis = tuple(axis) - axis = tuple(normalize_axis_index(ax, ndim, argname) for ax in axis) + # Optimization to speed-up the most common cases. + if type(axis) not in (tuple, list): + try: + axis = [operator.index(axis)] + except TypeError: + pass + # Going via an iterator directly is slower than via list comprehension. + axis = tuple([normalize_axis_index(ax, ndim, argname) for ax in axis]) if not allow_duplicate and len(set(axis)) != len(axis): if argname: raise ValueError('repeated axis in `{}` argument'.format(argname)) @@ -1896,7 +1900,7 @@ def fromfunction(function, shape, **kwargs): The result of the call to `function` is passed back directly. Therefore the shape of `fromfunction` is completely determined by `function`. If `function` returns a scalar value, the shape of - `fromfunction` would match the `shape` parameter. + `fromfunction` would not match the `shape` parameter. See Also -------- @@ -2015,7 +2019,7 @@ def binary_repr(num, width=None): References ---------- .. [1] Wikipedia, "Two's complement", - http://en.wikipedia.org/wiki/Two's_complement + https://en.wikipedia.org/wiki/Two's_complement Examples -------- @@ -2538,7 +2542,7 @@ def seterr(all=None, divide=None, over=None, under=None, invalid=None): - Invalid operation: result is not an expressible number, typically indicates that a NaN was produced. - .. [1] http://en.wikipedia.org/wiki/IEEE_754 + .. 
[1] https://en.wikipedia.org/wiki/IEEE_754 Examples -------- @@ -2914,15 +2918,13 @@ True_ = bool_(True) def extend_all(module): - adict = {} - for a in __all__: - adict[a] = 1 + existing = set(__all__) try: mall = getattr(module, '__all__') except AttributeError: mall = [k for k in module.__dict__.keys() if not k.startswith('_')] for a in mall: - if a not in adict: + if a not in existing: __all__.append(a) diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py index f7f25dd95..817af4c7b 100644 --- a/numpy/core/numerictypes.py +++ b/numpy/core/numerictypes.py @@ -92,7 +92,7 @@ from numpy.core.multiarray import ( datetime_as_string, busday_offset, busday_count, is_busday, busdaycalendar ) - +from numpy._globals import VisibleDeprecationWarning # we add more at the bottom __all__ = ['sctypeDict', 'sctypeNA', 'typeDict', 'typeNA', 'sctypes', @@ -210,8 +210,33 @@ def english_capitalize(s): sctypeDict = {} # Contains all leaf-node scalar types with aliases -sctypeNA = {} # Contails all leaf-node types -> numarray type equivalences -allTypes = {} # Collect the types we will add to the module here +class TypeNADict(dict): + def __getitem__(self, key): + # 2018-06-24, 1.16 + warnings.warn('sctypeNA and typeNA will be removed in v1.18 ' + 'of numpy', VisibleDeprecationWarning, stacklevel=2) + return dict.__getitem__(self, key) + def get(self, key, default=None): + # 2018-06-24, 1.16 + warnings.warn('sctypeNA and typeNA will be removed in v1.18 ' + 'of numpy', VisibleDeprecationWarning, stacklevel=2) + return dict.get(self, key, default) + +sctypeNA = TypeNADict() # Contains all leaf-node types -> numarray type equivalences +allTypes = {} # Collect the types we will add to the module here + + +# separate the actual type info from the abstract base classes +_abstract_types = {} +_concrete_typeinfo = {} +for k, v in typeinfo.items(): + # make all the keys lowercase too + k = english_lower(k) + if isinstance(v, type): + _abstract_types[k] = v + else: + _concrete_typeinfo[k] = v + def _evalname(name): k = 0 @@ -236,7 +261,7 @@ def bitname(obj): newname = name[:-1] else: newname = name - info = typeinfo[english_upper(newname)] + info = _concrete_typeinfo[english_lower(newname)] assert(info.type == obj) # sanity check bits = info.bits @@ -283,71 +308,79 @@ def bitname(obj): def _add_types(): - for type_name, info in typeinfo.items(): - name = english_lower(type_name) - if not isinstance(info, type): - # define C-name and insert typenum and typechar references also - allTypes[name] = info.type - sctypeDict[name] = info.type - sctypeDict[info.char] = info.type - sctypeDict[info.num] = info.type - - else: # generic class - allTypes[name] = info + for name, info in _concrete_typeinfo.items(): + # define C-name and insert typenum and typechar references also + allTypes[name] = info.type + sctypeDict[name] = info.type + sctypeDict[info.char] = info.type + sctypeDict[info.num] = info.type + + for name, cls in _abstract_types.items(): + allTypes[name] = cls _add_types() +# This is the priority order used to assign the bit-sized NPY_INTxx names, which +# must match the order in npy_common.h in order for NPY_INTxx and np.intxx to be +# consistent. +# If two C types have the same size, then the earliest one in this list is used +# as the sized name.
+_int_ctypes = ['long', 'longlong', 'int', 'short', 'byte'] +_uint_ctypes = list('u' + t for t in _int_ctypes) + def _add_aliases(): - for type_name, info in typeinfo.items(): - if isinstance(info, type): + for name, info in _concrete_typeinfo.items(): + # these are handled by _add_integer_aliases + if name in _int_ctypes or name in _uint_ctypes: continue - name = english_lower(type_name) # insert bit-width version for this class (if relevant) base, bit, char = bitname(info.type) - if base[-3:] == 'int' or char[0] in 'ui': + + assert base != '' + myname = "%s%d" % (base, bit) + + # ensure that (c)longdouble does not overwrite the aliases assigned to + # (c)double + if name in ('longdouble', 'clongdouble') and myname in allTypes: continue - if base != '': - myname = "%s%d" % (base, bit) - if (name not in ('longdouble', 'clongdouble') or - myname not in allTypes): - base_capitalize = english_capitalize(base) - if base == 'complex': - na_name = '%s%d' % (base_capitalize, bit//2) - elif base == 'bool': - na_name = base_capitalize - else: - na_name = "%s%d" % (base_capitalize, bit) - allTypes[myname] = info.type + base_capitalize = english_capitalize(base) + if base == 'complex': + na_name = '%s%d' % (base_capitalize, bit//2) + elif base == 'bool': + na_name = base_capitalize + else: + na_name = "%s%d" % (base_capitalize, bit) + + allTypes[myname] = info.type + + # add mapping for both the bit name and the numarray name + sctypeDict[myname] = info.type + sctypeDict[na_name] = info.type - # add mapping for both the bit name and the numarray name - sctypeDict[myname] = info.type - sctypeDict[na_name] = info.type + # add forward, reverse, and string mapping to numarray + sctypeNA[na_name] = info.type + sctypeNA[info.type] = na_name + sctypeNA[info.char] = na_name - # add forward, reverse, and string mapping to numarray - sctypeNA[na_name] = info.type - sctypeNA[info.type] = na_name - sctypeNA[info.char] = na_name - if char != '': - sctypeDict[char] = info.type - sctypeNA[char] = na_name + assert char != '' + sctypeDict[char] = info.type + sctypeNA[char] = na_name _add_aliases() -# Integers are handled so that the int32 and int64 types should agree -# exactly with NPY_INT32, NPY_INT64. We need to enforce the same checking -# as is done in arrayobject.h where the order of getting a bit-width match -# is long, longlong, int, short, char. 
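A brief sketch of this priority rule in action (illustrative only; the mapping is platform-dependent, and the values shown assume a typical 64-bit LP64 Linux build):

>>> import numpy as np
>>> np.dtype(np.int64).char   # 'long' precedes 'longlong', so int64 maps to C long
'l'
>>> np.dtype('l') == np.dtype('q')   # same width, so the dtypes compare as equivalent
True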
def _add_integer_aliases(): - _ctypes = ['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE'] - for ctype in _ctypes: - i_info = typeinfo[ctype] - u_info = typeinfo['U'+ctype] + seen_bits = set() + for i_ctype, u_ctype in zip(_int_ctypes, _uint_ctypes): + i_info = _concrete_typeinfo[i_ctype] + u_info = _concrete_typeinfo[u_ctype] bits = i_info.bits # same for both for info, charname, intname, Intname in [ (i_info,'i%d' % (bits//8,), 'int%d' % bits, 'Int%d' % bits), (u_info,'u%d' % (bits//8,), 'uint%d' % bits, 'UInt%d' % bits)]: - if intname not in allTypes.keys(): + if bits not in seen_bits: + # sometimes two different types have the same number of bits + # if so, the one iterated over first takes precedence allTypes[intname] = info.type sctypeDict[intname] = info.type sctypeDict[Intname] = info.type @@ -356,6 +389,9 @@ def _add_integer_aliases(): sctypeNA[charname] = info.type sctypeNA[info.type] = Intname sctypeNA[info.char] = Intname + + seen_bits.add(bits) + _add_integer_aliases() # We use these later @@ -383,16 +419,14 @@ def _set_up_aliases(): ('clongfloat', 'clongdouble'), ('longcomplex', 'clongdouble'), ('bool_', 'bool'), + ('bytes_', 'string'), + ('string_', 'string'), ('unicode_', 'unicode'), ('object_', 'object')] if sys.version_info[0] >= 3: - type_pairs.extend([('bytes_', 'string'), - ('str_', 'unicode'), - ('string_', 'string')]) + type_pairs.extend([('str_', 'unicode')]) else: - type_pairs.extend([('str_', 'string'), - ('string_', 'string'), - ('bytes_', 'string')]) + type_pairs.extend([('str_', 'string')]) for alias, t in type_pairs: allTypes[alias] = allTypes[t] sctypeDict[alias] = sctypeDict[t] @@ -416,10 +450,9 @@ _set_up_aliases() # Now, construct dictionary to lookup character codes from types _sctype2char_dict = {} def _construct_char_code_lookup(): - for name, info in typeinfo.items(): - if not isinstance(info, type): - if info.char not in ['p', 'P']: - _sctype2char_dict[info.type] = info.char + for name, info in _concrete_typeinfo.items(): + if info.char not in ['p', 'P']: + _sctype2char_dict[info.type] = info.char _construct_char_code_lookup() @@ -764,9 +797,7 @@ _alignment = _typedict() _maxvals = _typedict() _minvals = _typedict() def _construct_lookups(): - for name, info in typeinfo.items(): - if isinstance(info, type): - continue + for name, info in _concrete_typeinfo.items(): obj = info.type nbytes[obj] = info.bits // 8 _alignment[obj] = info.alignment diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index 70a43046c..356482b07 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -40,8 +40,8 @@ C_ABI_VERSION = 0x01000009 # 0x0000000a - 1.12.x # 0x0000000b - 1.13.x # 0x0000000c - 1.14.x -# 0x0000000d - 1.15.x -C_API_VERSION = 0x0000000d +# 0x0000000c - 1.15.x +C_API_VERSION = 0x0000000c class MismatchCAPIWarning(Warning): pass diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src index cba96a4c2..67c9a333c 100644 --- a/numpy/core/src/multiarray/_multiarray_tests.c.src +++ b/numpy/core/src/multiarray/_multiarray_tests.c.src @@ -1042,76 +1042,6 @@ test_nditer_too_large(PyObject *NPY_UNUSED(self), PyObject *args) { } static PyObject * -test_nditer_writeback(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) -{ - /* like npyiter_init */ - PyObject *op_in = NULL, *op_dtypes_in = NULL, *value = NULL; - PyArrayObject * opview; - int iop, nop = 0; - PyArrayObject *op[NPY_MAXARGS]; - npy_uint32 flags = 0; - NPY_ORDER order = NPY_KEEPORDER; - NPY_CASTING casting = 
NPY_EQUIV_CASTING; - npy_uint32 op_flags[NPY_MAXARGS]; - PyArray_Descr *op_request_dtypes[NPY_MAXARGS]; - int retval; - unsigned char do_close; - int buffersize = 0; - NpyIter *iter = NULL; - static char *kwlist[] = {"value", "do_close", "input", "op_dtypes", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, - "ObO|O:test_nditer_writeback", kwlist, - &value, - &do_close, - &op_in, - &op_dtypes_in)) { - return NULL; - } - /* op and op_flags */ - if (! PyArray_Check(op_in)) { - return NULL; - } - nop = 1; - op[0] = (PyArrayObject*)op_in; - op_flags[0] = NPY_ITER_READWRITE|NPY_ITER_UPDATEIFCOPY; - - /* Set the dtypes */ - for (iop=0; iop<nop; iop++) { - PyObject *dtype = PySequence_GetItem(op_dtypes_in, iop); - PyArray_DescrConverter2(dtype, &op_request_dtypes[iop]); - } - - iter = NpyIter_AdvancedNew(nop, op, flags, order, casting, op_flags, - op_request_dtypes, - -1, NULL, NULL, - buffersize); - if (iter == NULL) { - goto fail; - } - - opview = NpyIter_GetIterView(iter, 0); - retval = PyArray_FillWithScalar(opview, value); - Py_DECREF(opview); - if (retval < 0) { - NpyIter_Deallocate(iter); - return NULL; - } - if (do_close != 0) { - NpyIter_Close(iter); - } - NpyIter_Deallocate(iter); - Py_RETURN_NONE; - -fail: - for (iop = 0; iop < nop; ++iop) { - Py_XDECREF(op[iop]); - Py_XDECREF(op_request_dtypes[iop]); - } - return NULL; -} - -static PyObject * array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) { PyObject *A = NULL; @@ -1948,9 +1878,6 @@ static PyMethodDef Multiarray_TestsMethods[] = { {"test_nditer_too_large", test_nditer_too_large, METH_VARARGS, NULL}, - {"test_nditer_writeback", - (PyCFunction)test_nditer_writeback, - METH_VARARGS | METH_KEYWORDS, NULL}, {"solve_diophantine", (PyCFunction)array_solve_diophantine, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 943edc772..368f5ded7 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -482,8 +482,7 @@ array_dealloc(PyArrayObject *self) { char const * msg = "WRITEBACKIFCOPY detected in array_dealloc. " " Required call to PyArray_ResolveWritebackIfCopy or " - "PyArray_DiscardWritebackIfCopy is missing. This could also " - "be caused by using a nditer without a context manager"; + "PyArray_DiscardWritebackIfCopy is missing."; Py_INCREF(self); /* hold on to self in next call since if * refcount == 0 it will recurse back into *array_dealloc @@ -1250,7 +1249,8 @@ PyArray_ChainExceptionsCause(PyObject *exc, PyObject *val, PyObject *tb) } } -/* Silence the current error and emit a deprecation warning instead. +/* + * Silence the current error and emit a deprecation warning instead. * * If warnings are raised as errors, this sets the warning __cause__ to the * silenced error. @@ -1269,6 +1269,118 @@ DEPRECATE_silence_error(const char *msg) { return 0; } +/* + * Comparisons can fail, but we do not always want to pass on the exception + * (see comment in array_richcompare below), but rather return NotImplemented. + * Here, an exception should be set on entrance. + * Returns either NotImplemented with the exception cleared, or NULL + * with the exception set. 
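+ * (Illustration: in 1.15, ``np.arange(2) == 'foo'`` emits a FutureWarning
+ * and evaluates to the scalar False rather than an elementwise result;
+ * that fallback is what this function implements.)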
+ * Raises deprecation warnings for cases where behaviour is meant to change + * (2015-05-14, 1.10) + */ + +NPY_NO_EXPORT PyObject * +_failed_comparison_workaround(PyArrayObject *self, PyObject *other, int cmp_op) +{ + PyObject *exc, *val, *tb; + PyArrayObject *array_other; + int other_is_flexible, ndim_other; + int self_is_flexible = PyTypeNum_ISFLEXIBLE(PyArray_DESCR(self)->type_num); + + PyErr_Fetch(&exc, &val, &tb); + /* + * Determine whether other has a flexible dtype; here, inconvertible + * is counted as inflexible. (This repeats work done in the ufunc, + * but OK to waste some time in an unlikely path.) + */ + array_other = (PyArrayObject *)PyArray_FROM_O(other); + if (array_other) { + other_is_flexible = PyTypeNum_ISFLEXIBLE( + PyArray_DESCR(array_other)->type_num); + ndim_other = PyArray_NDIM(array_other); + Py_DECREF(array_other); + } + else { + PyErr_Clear(); /* we restore the original error if needed */ + other_is_flexible = 0; + ndim_other = 0; + } + if (cmp_op == Py_EQ || cmp_op == Py_NE) { + /* + * note: for == and !=, a structured dtype self cannot get here, + * but a string can. Other can be string or structured. + */ + if (other_is_flexible || self_is_flexible) { + /* + * For scalars, returning NotImplemented is correct. + * For arrays, we emit a future deprecation warning. + * When this warning is removed, a correctly shaped + * array of bool should be returned. + */ + if (ndim_other != 0 || PyArray_NDIM(self) != 0) { + /* 2015-05-14, 1.10 */ + if (DEPRECATE_FUTUREWARNING( + "elementwise comparison failed; returning scalar " + "instead, but in the future will perform " + "elementwise comparison") < 0) { + goto fail; + } + } + } + else { + /* + * If neither self nor other had a flexible dtype, the error cannot + * have been caused by a lack of implementation in the ufunc. + * + * 2015-05-14, 1.10 + */ + if (DEPRECATE( + "elementwise comparison failed; " + "this will raise an error in the future.") < 0) { + goto fail; + } + } + Py_XDECREF(exc); + Py_XDECREF(val); + Py_XDECREF(tb); + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + else if (other_is_flexible || self_is_flexible) { + /* + * For LE, LT, GT, GE and a flexible self or other, we return + * NotImplemented, which is the correct answer since the ufuncs do + * not in fact implement loops for those. On python 3 this will + * get us the desired TypeError, but on python 2, one gets strange + * ordering, so we emit a warning. + */ +#if !defined(NPY_PY3K) + /* 2015-05-14, 1.10 */ + if (DEPRECATE( + "unorderable dtypes; returning scalar but in " + "the future this will be an error") < 0) { + goto fail; + } +#endif + Py_XDECREF(exc); + Py_XDECREF(val); + Py_XDECREF(tb); + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + else { + /* LE, LT, GT, or GE with non-flexible other; just pass on error */ + goto fail; + } + +fail: + /* + * Reraise the original exception, possibly chaining with a new one. + */ + PyArray_ChainExceptionsCause(exc, val, tb); + return NULL; +} + NPY_NO_EXPORT PyObject * array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op) { @@ -1366,26 +1478,6 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op) result = PyArray_GenericBinaryFunction(self, (PyObject *)other, n_ops.equal); - /* - * If the comparison results in NULL, then the - * two array objects can not be compared together; - * indicate that - */ - if (result == NULL) { - /* - * Comparisons should raise errors when element-wise comparison - * is not possible. 
- */ - /* 2015-05-14, 1.10 */ - if (DEPRECATE_silence_error( - "elementwise == comparison failed; " - "this will raise an error in the future.") < 0) { - return NULL; - } - - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } break; case Py_NE: RICHCMP_GIVE_UP_IF_NEEDED(obj_self, other); @@ -1437,21 +1529,6 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op) result = PyArray_GenericBinaryFunction(self, (PyObject *)other, n_ops.not_equal); - if (result == NULL) { - /* - * Comparisons should raise errors when element-wise comparison - * is not possible. - */ - /* 2015-05-14, 1.10 */ - if (DEPRECATE_silence_error( - "elementwise != comparison failed; " - "this will raise an error in the future.") < 0) { - return NULL; - } - - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } break; case Py_GT: RICHCMP_GIVE_UP_IF_NEEDED(obj_self, other); @@ -1464,8 +1541,37 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op) n_ops.greater_equal); break; default: - result = Py_NotImplemented; - Py_INCREF(result); + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + if (result == NULL) { + /* + * 2015-05-14, 1.10; updated 2018-06-18, 1.16. + * + * Comparisons can raise errors when element-wise comparison is not + * possible. Some of these, though, should not be passed on. + * In particular, the ufuncs do not have loops for flexible dtype, + * so those should be treated separately. Furthermore, for EQ and NE, + * we should never fail. + * + * Our ideal behaviour would be: + * + * 1. For EQ and NE: + * - If self and other are scalars, return NotImplemented, + * so that python can assign True or False as appropriate. + * - If either is an array, return an array of False or True. + * + * 2. For LT, LE, GE, GT: + * - If self or other was flexible, return NotImplemented + * (as is in fact the case), so python can raise a TypeError. + * - If other is not convertible to an array, pass on the error + * (MHvK, 2018-06-18: not sure about this, but it's what we have). + * + * However, for backwards compatibility, we cannot yet return arrays, + * so we raise warnings instead. Furthermore, we warn on python2 + * for LT, LE, GE, GT, since fall-back behaviour is poorly defined. + */ + result = _failed_comparison_workaround(self, other, cmp_op); } return result; } diff --git a/numpy/core/src/multiarray/cblasfuncs.c b/numpy/core/src/multiarray/cblasfuncs.c index c941bb29b..6460c5db1 100644 --- a/numpy/core/src/multiarray/cblasfuncs.c +++ b/numpy/core/src/multiarray/cblasfuncs.c @@ -12,32 +12,6 @@ #include "npy_cblas.h" #include "arraytypes.h" #include "common.h" -#include "mem_overlap.h" - - -/* - * Helper: call appropriate BLAS dot function for typenum. - * Strides are NumPy strides. - */ -static void -blas_dot(int typenum, npy_intp n, - void *a, npy_intp stridea, void *b, npy_intp strideb, void *res) -{ - switch (typenum) { - case NPY_DOUBLE: - DOUBLE_dot(a, stridea, b, strideb, res, n, NULL); - break; - case NPY_FLOAT: - FLOAT_dot(a, stridea, b, strideb, res, n, NULL); - break; - case NPY_CDOUBLE: - CDOUBLE_dot(a, stridea, b, strideb, res, n, NULL); - break; - case NPY_CFLOAT: - CFLOAT_dot(a, stridea, b, strideb, res, n, NULL); - break; - } -} static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0}; @@ -227,6 +201,7 @@ _bad_strides(PyArrayObject *ap) return 0; } + /* * dot(a,b) * Returns the dot product of a and b for arrays of floating point types.
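 * (For instance, ``np.dot(a, b)`` with float32/float64/complex operands
 * is dispatched here when NumPy is built against a BLAS implementation;
 * illustrative note.)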
@@ -379,77 +354,9 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2, } } - if (out != NULL) { - int d; - - /* verify that out is usable */ - if (PyArray_NDIM(out) != nd || - PyArray_TYPE(out) != typenum || - !PyArray_ISCARRAY(out)) { - - PyErr_SetString(PyExc_ValueError, - "output array is not acceptable (must have the right datatype, " - "number of dimensions, and be a C-Array)"); - goto fail; - } - for (d = 0; d < nd; ++d) { - if (dimensions[d] != PyArray_DIM(out, d)) { - PyErr_SetString(PyExc_ValueError, - "output array has wrong dimensions"); - goto fail; - } - } - - /* check for memory overlap */ - if (!(solve_may_share_memory(out, ap1, 1) == 0 && - solve_may_share_memory(out, ap2, 1) == 0)) { - /* allocate temporary output array */ - out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER, - NULL, 0); - if (out_buf == NULL) { - goto fail; - } - - /* set copy-back */ - Py_INCREF(out); - if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) { - Py_DECREF(out); - goto fail; - } - } - else { - Py_INCREF(out); - out_buf = out; - } - Py_INCREF(out); - result = out; - } - else { - double prior1, prior2; - PyTypeObject *subtype; - PyObject *tmp; - - /* Choose which subtype to return */ - if (Py_TYPE(ap1) != Py_TYPE(ap2)) { - prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0); - prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0); - subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1)); - } - else { - prior1 = prior2 = 0.0; - subtype = Py_TYPE(ap1); - } - - tmp = (PyObject *)(prior2 > prior1 ? ap2 : ap1); - - out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions, - typenum, NULL, NULL, 0, 0, tmp); - if (out_buf == NULL) { - goto fail; - } - - Py_INCREF(out_buf); - result = out_buf; + out_buf = new_array_for_sum(ap1, ap2, out, nd, dimensions, typenum, &result); + if (out_buf == NULL) { + goto fail; } numbytes = PyArray_NBYTES(out_buf); @@ -617,10 +524,10 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2, NPY_BEGIN_ALLOW_THREADS; /* Dot product between two vectors -- Level 1 BLAS */ - blas_dot(typenum, l, + PyArray_DESCR(out_buf)->f->dotfunc( PyArray_DATA(ap1), PyArray_STRIDE(ap1, (ap1shape == _row)), PyArray_DATA(ap2), PyArray_STRIDE(ap2, 0), - PyArray_DATA(out_buf)); + PyArray_DATA(out_buf), l, NULL); NPY_END_ALLOW_THREADS; } else if (ap1shape == _matrix && ap2shape != _matrix) { diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index f191f8db4..4f695fdc7 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -15,6 +15,7 @@ #include "buffer.h" #include "get_attr_string.h" +#include "mem_overlap.h" /* * The casting to use for implicit assignment operations resulting from @@ -424,7 +425,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims, * __len__ is not defined. */ if (maxdims == 0 || !PySequence_Check(obj) || PySequence_Size(obj) < 0) { - // clear any PySequence_Size error, which corrupts further calls to it + /* clear any PySequence_Size error which corrupts further calls */ PyErr_Clear(); if (*out_dtype == NULL || (*out_dtype)->type_num != NPY_OBJECT) { @@ -852,3 +853,102 @@ _may_have_objects(PyArray_Descr *dtype) return (PyDataType_HASFIELDS(base) || PyDataType_FLAGCHK(base, NPY_ITEM_HASOBJECT) ); } + +/* + * Make a new empty array, of the passed size, of a type that takes the + * priority of ap1 and ap2 into account. 
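+ * (For example, if ap2 is an ndarray subclass with a higher
+ * __array_priority__ than ap1, the result is allocated with ap2's
+ * subtype.)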
+ * + * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an + * updateifcopy temporary array may be returned. If `result` is non-NULL, the + * output array to be returned (`out` if non-NULL and the newly allocated array + * otherwise) is incref'd and put to *result. + */ +NPY_NO_EXPORT PyArrayObject * +new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, + int nd, npy_intp dimensions[], int typenum, PyArrayObject **result) +{ + PyArrayObject *out_buf; + + if (out) { + int d; + + /* verify that out is usable */ + if (PyArray_NDIM(out) != nd || + PyArray_TYPE(out) != typenum || + !PyArray_ISCARRAY(out)) { + PyErr_SetString(PyExc_ValueError, + "output array is not acceptable (must have the right datatype, " + "number of dimensions, and be a C-Array)"); + return 0; + } + for (d = 0; d < nd; ++d) { + if (dimensions[d] != PyArray_DIM(out, d)) { + PyErr_SetString(PyExc_ValueError, + "output array has wrong dimensions"); + return 0; + } + } + + /* check for memory overlap */ + if (!(solve_may_share_memory(out, ap1, 1) == 0 && + solve_may_share_memory(out, ap2, 1) == 0)) { + /* allocate temporary output array */ + out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER, + NULL, 0); + if (out_buf == NULL) { + return NULL; + } + + /* set copy-back */ + Py_INCREF(out); + if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) { + Py_DECREF(out); + Py_DECREF(out_buf); + return NULL; + } + } + else { + Py_INCREF(out); + out_buf = out; + } + + if (result) { + Py_INCREF(out); + *result = out; + } + + return out_buf; + } + else { + PyTypeObject *subtype; + double prior1, prior2; + /* + * Need to choose an output array that can hold a sum + * -- use priority to determine which subtype. + */ + if (Py_TYPE(ap2) != Py_TYPE(ap1)) { + prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0); + prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0); + subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1)); + } + else { + prior1 = prior2 = 0.0; + subtype = Py_TYPE(ap1); + } + + out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions, + typenum, NULL, NULL, 0, 0, + (PyObject *) + (prior2 > prior1 ? ap2 : ap1)); + + if (out_buf != NULL && result) { + Py_INCREF(out_buf); + *result = out_buf; + } + + return out_buf; + } +} + + + diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index ae9b960c8..db0a49920 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -283,4 +283,17 @@ blas_stride(npy_intp stride, unsigned itemsize) #include "ucsnarrow.h" +/* + * Make a new empty array, of the passed size, of a type that takes the + * priority of ap1 and ap2 into account. + * + * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an + * updateifcopy temporary array may be returned. If `result` is non-NULL, the + * output array to be returned (`out` if non-NULL and the newly allocated array + * otherwise) is incref'd and put to *result. + */ +NPY_NO_EXPORT PyArrayObject * +new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, + int nd, npy_intp dimensions[], int typenum, PyArrayObject **result); + #endif diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index bcb44f6d1..1c27f8394 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -21,11 +21,17 @@ * and 0 if the array is not monotonic. 
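 * For example (illustrative values): {1., 2., 2., 3.} is monotonically
 * increasing (returns 1), {3., 2., 1.} is decreasing (returns -1), and
 * {1., 2., 1.} is neither (returns 0).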
*/ static int -check_array_monotonic(const double *a, npy_int lena) +check_array_monotonic(const double *a, npy_intp lena) { npy_intp i; double next; - double last = a[0]; + double last; + + if (lena == 0) { + /* all bin edges hold the same value */ + return 1; + } + + last = a[0]; /* Skip repeated values at the beginning of the array */ for (i = 1; (i < lena) && (a[i] == last); i++); @@ -209,106 +215,41 @@ fail: return NULL; } -/* - * digitize(x, bins, right=False) returns an array of integers the same length - * as x. The values i returned are such that bins[i - 1] <= x < bins[i] if - * bins is monotonically increasing, or bins[i - 1] > x >= bins[i] if bins - * is monotonically decreasing. Beyond the bounds of bins, returns either - * i = 0 or i = len(bins) as appropriate. If right == True the comparison - * is bins [i - 1] < x <= bins[i] or bins [i - 1] >= x > bins[i] - */ +/* Internal function to expose check_array_monotonic to python */ NPY_NO_EXPORT PyObject * -arr_digitize(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) +arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) { + static char *kwlist[] = {"x", NULL}; PyObject *obj_x = NULL; - PyObject *obj_bins = NULL; PyArrayObject *arr_x = NULL; - PyArrayObject *arr_bins = NULL; - PyObject *ret = NULL; - npy_intp len_bins; - int monotonic, right = 0; - NPY_BEGIN_THREADS_DEF - - static char *kwlist[] = {"x", "bins", "right", NULL}; + long monotonic; + npy_intp len_x; + NPY_BEGIN_THREADS_DEF; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|i:digitize", kwlist, - &obj_x, &obj_bins, &right)) { - goto fail; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:_monotonicity", kwlist, + &obj_x)) { + return NULL; } - /* PyArray_SearchSorted will make `x` contiguous even if we don't */ - arr_x = (PyArrayObject *)PyArray_FROMANY(obj_x, NPY_DOUBLE, 0, 0, - NPY_ARRAY_CARRAY_RO); + /* + * TODO: + * `x` could be strided, needs change to check_array_monotonic + * `x` is forced to double for this check + */ + arr_x = (PyArrayObject *)PyArray_FROMANY( + obj_x, NPY_DOUBLE, 1, 1, NPY_ARRAY_CARRAY_RO); if (arr_x == NULL) { - goto fail; - } - - /* TODO: `bins` could be strided, needs change to check_array_monotonic */ - arr_bins = (PyArrayObject *)PyArray_FROMANY(obj_bins, NPY_DOUBLE, 1, 1, - NPY_ARRAY_CARRAY_RO); + return NULL; } - if (arr_bins == NULL) { - goto fail; - } - - len_bins = PyArray_SIZE(arr_bins); - if (len_bins == 0) { - PyErr_SetString(PyExc_ValueError, "bins must have non-zero length"); - goto fail; - } - NPY_BEGIN_THREADS_THRESHOLDED(len_bins) - monotonic = check_array_monotonic((const double *)PyArray_DATA(arr_bins), - len_bins); + len_x = PyArray_SIZE(arr_x); + NPY_BEGIN_THREADS_THRESHOLDED(len_x) + monotonic = check_array_monotonic( + (const double *)PyArray_DATA(arr_x), len_x); NPY_END_THREADS + Py_DECREF(arr_x); - if (monotonic == 0) { - PyErr_SetString(PyExc_ValueError, - "bins must be monotonically increasing or decreasing"); - goto fail; - } - - /* PyArray_SearchSorted needs an increasing array */ - if (monotonic == - 1) { - PyArrayObject *arr_tmp = NULL; - npy_intp shape = PyArray_DIM(arr_bins, 0); - npy_intp stride = -PyArray_STRIDE(arr_bins, 0); - void *data = (void *)(PyArray_BYTES(arr_bins) - stride * (shape - 1)); - - arr_tmp = (PyArrayObject *)PyArray_NewFromDescrAndBase( - &PyArray_Type, PyArray_DescrFromType(NPY_DOUBLE), - 1, &shape, &stride, data, - PyArray_FLAGS(arr_bins), NULL, (PyObject *)arr_bins); - Py_DECREF(arr_bins); - if (!arr_tmp) { - goto fail; - } - arr_bins = arr_tmp; - } -
ret = PyArray_SearchSorted(arr_bins, (PyObject *)arr_x, - right ? NPY_SEARCHLEFT : NPY_SEARCHRIGHT, NULL); - if (!ret) { - goto fail; - } - - /* If bins is decreasing, ret has bins from end, not start */ - if (monotonic == -1) { - npy_intp *ret_data = - (npy_intp *)PyArray_DATA((PyArrayObject *)ret); - npy_intp len_ret = PyArray_SIZE((PyArrayObject *)ret); - - NPY_BEGIN_THREADS_THRESHOLDED(len_ret) - while (len_ret--) { - *ret_data = len_bins - *ret_data; - ret_data++; - } - NPY_END_THREADS - } - - fail: - Py_XDECREF(arr_x); - Py_XDECREF(arr_bins); - return ret; + return PyInt_FromLong(monotonic); } /* @@ -654,6 +595,10 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) else if (j == lenxp - 1) { dres[i] = dy[j]; } + else if (dx[j] == x_val) { + /* Avoid potential non-finite interpolation */ + dres[i] = dy[j]; + } else { const npy_double slope = (slopes != NULL) ? slopes[j] : (dy[j+1] - dy[j]) / (dx[j+1] - dx[j]); @@ -822,6 +767,10 @@ arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) else if (j == lenxp - 1) { dres[i] = dy[j]; } + else if (dx[j] == x_val) { + /* Avoid potential non-finite interpolation */ + dres[i] = dy[j]; + } else { if (slopes!=NULL) { dres[i].real = slopes[j].real*(x_val - dx[j]) + dy[j].real; diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h index 51508531c..082139910 100644 --- a/numpy/core/src/multiarray/compiled_base.h +++ b/numpy/core/src/multiarray/compiled_base.h @@ -7,7 +7,7 @@ arr_insert(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * arr_bincount(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * -arr_digitize(PyObject *, PyObject *, PyObject *kwds); +arr__monotonicity(PyObject *, PyObject *, PyObject *kwds); NPY_NO_EXPORT PyObject * arr_interp(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 0e38aaa61..e88582a51 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -613,11 +613,14 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype) subtype = Py_TYPE(self); } - if (type != NULL && (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) { + dtype = PyArray_DESCR(self); + + if (type != NULL && !PyArray_EquivTypes(dtype, type) && + (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) { const char *msg = "Numpy has detected that you may be viewing or writing to an array " "returned by selecting multiple fields in a structured array. 
\n\n" - "This code may break in numpy 1.13 because this will return a view " + "This code may break in numpy 1.16 because this will return a view " "instead of a copy -- see release notes for details."; /* 2016-09-19, 1.12 */ if (DEPRECATE_FUTUREWARNING(msg) < 0) { @@ -629,7 +632,6 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype) flags = PyArray_FLAGS(self); - dtype = PyArray_DESCR(self); Py_INCREF(dtype); ret = (PyArrayObject *)PyArray_NewFromDescr_int( subtype, dtype, diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 7367902cc..938850997 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -666,7 +666,6 @@ discover_dimensions(PyObject *obj, int *maxndim, npy_intp *d, int check_it, int *out_is_object) { PyObject *e; - int r; npy_intp n, i; Py_buffer buffer_view; PyObject * seq; @@ -846,46 +845,48 @@ discover_dimensions(PyObject *obj, int *maxndim, npy_intp *d, int check_it, return 0; } else { - npy_intp dtmp[NPY_MAXDIMS]; - int j, maxndim_m1 = *maxndim - 1; - e = PySequence_Fast_GET_ITEM(seq, 0); - - r = discover_dimensions(e, &maxndim_m1, d + 1, check_it, - stop_at_string, stop_at_tuple, - out_is_object); - if (r < 0) { + int all_elems_maxndim = *maxndim - 1; + npy_intp *all_elems_d = d + 1; + int all_dimensions_match = 1; + + /* Get the dimensions of the first item as a baseline */ + PyObject *first = PySequence_Fast_GET_ITEM(seq, 0); + if (discover_dimensions( + first, &all_elems_maxndim, all_elems_d, check_it, + stop_at_string, stop_at_tuple, out_is_object) < 0) { Py_DECREF(seq); - return r; + return -1; } - /* For the dimension truncation check below */ - *maxndim = maxndim_m1 + 1; + /* Compare the dimensions of all the remaining items */ for (i = 1; i < n; ++i) { - e = PySequence_Fast_GET_ITEM(seq, i); - /* Get the dimensions of the first item */ - r = discover_dimensions(e, &maxndim_m1, dtmp, check_it, - stop_at_string, stop_at_tuple, - out_is_object); - if (r < 0) { + int j; + int elem_maxndim = *maxndim - 1; + npy_intp elem_d[NPY_MAXDIMS]; + + PyObject *elem = PySequence_Fast_GET_ITEM(seq, i); + if (discover_dimensions( + elem, &elem_maxndim, elem_d, check_it, + stop_at_string, stop_at_tuple, out_is_object) < 0) { Py_DECREF(seq); - return r; + return -1; } - /* Reduce max_ndim_m1 to just items which match */ - for (j = 0; j < maxndim_m1; ++j) { - if (dtmp[j] != d[j+1]) { - maxndim_m1 = j; + /* Find the number of left-dimensions which match, j */ + for (j = 0; j < elem_maxndim && j < all_elems_maxndim; ++j) { + if (elem_d[j] != all_elems_d[j]) { break; } } + if (j != elem_maxndim || j != all_elems_maxndim) { + all_dimensions_match = 0; + } + all_elems_maxndim = j; } - /* - * If the dimensions are truncated, need to produce - * an object array. 
- */ - if (maxndim_m1 + 1 < *maxndim) { + *maxndim = all_elems_maxndim + 1; + if (!all_dimensions_match) { + /* typically results in an array containing variable-length lists */ *out_is_object = 1; - *maxndim = maxndim_m1 + 1; } } @@ -1704,9 +1705,9 @@ PyArray_GetArrayParamsFromObject(PyObject *op, *out_ndim = NPY_MAXDIMS; is_object = 0; - if (discover_dimensions(op, out_ndim, out_dims, check_it, - stop_at_string, stop_at_tuple, - &is_object) < 0) { + if (discover_dimensions( + op, out_ndim, out_dims, check_it, + stop_at_string, stop_at_tuple, &is_object) < 0) { Py_DECREF(*out_dtype); if (PyErr_Occurred()) { return -1; diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c index 4f9d8fa41..95b7bb3dc 100644 --- a/numpy/core/src/multiarray/datetime_strings.c +++ b/numpy/core/src/multiarray/datetime_strings.c @@ -69,7 +69,7 @@ * multiplatform code, get_localtime() should never be used outside of this * range. * - * [1] http://en.wikipedia.org/wiki/Year_2038_problem + * [1] https://en.wikipedia.org/wiki/Year_2038_problem */ static int get_localtime(NPY_TIME_T *ts, struct tm *tms) diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index bb3cc9d4e..a0dc98f0e 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -83,7 +83,7 @@ _arraydescr_fromctypes(PyObject *obj) /* derived type */ PyObject *newtup; PyArray_Descr *derived; - newtup = Py_BuildValue("NN", newdescr, length); + newtup = Py_BuildValue("N(N)", newdescr, length); ret = PyArray_DescrConverter(newtup, &derived); Py_DECREF(newtup); if (ret == NPY_SUCCEED) { diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c index c14653ac5..abbf05220 100644 --- a/numpy/core/src/multiarray/dragon4.c +++ b/numpy/core/src/multiarray/dragon4.c @@ -2698,7 +2698,7 @@ Dragon4_PrintFloat_Intel_extended128( } #endif /* HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE */ -#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE) +#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE) || defined(HAVE_LDOUBLE_IEEE_QUAD_BE) /* * IEEE binary128 floating-point format * @@ -2707,18 +2707,14 @@ Dragon4_PrintFloat_Intel_extended128( * mantissa: 112 bits * * Currently binary128 format exists on only a few CPUs, such as on the POWER9 - * arch. Because of this, this code has not been tested. I am not sure if the - * arch also supports uint128, and C does not seem to support int128 literals. - * So we use uint64 to do manipulation. Unfortunately this means we are endian - * dependent. Assume little-endian for now, can fix later once binary128 - * becomes more common. + * arch or aarch64. Because of this, this code has not been extensively tested. + * I am not sure if the arch also supports uint128, and C does not seem to + * support int128 literals. So we use uint64 to do manipulation. */ static npy_uint32 Dragon4_PrintFloat_IEEE_binary128( - Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) + Dragon4_Scratch *scratch, FloatVal128 val128, Dragon4_Options *opt) { - FloatUnion128 buf128; - char *buffer = scratch->repr; npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; @@ -2731,8 +2727,6 @@ Dragon4_PrintFloat_IEEE_binary128( npy_bool hasUnequalMargins; char signbit = '\0'; - buf128.floatingPoint = *value; - if (bufferSize == 0) { return 0; } @@ -2742,11 +2736,10 @@ Dragon4_PrintFloat_IEEE_binary128( return 0; } - /* Assumes little-endian !!! 
*/ - mantissa_hi = buf128.integer.a & bitmask_u64(48); - mantissa_lo = buf128.integer.b; - floatExponent = (buf128.integer.a >> 48) & bitmask_u32(15); - floatSign = buf128.integer.a >> 63; + mantissa_hi = val128.hi & bitmask_u64(48); + mantissa_lo = val128.lo; + floatExponent = (val128.hi >> 48) & bitmask_u32(15); + floatSign = val128.hi >> 63; /* output the sign */ if (floatSign != 0) { @@ -2810,8 +2803,45 @@ Dragon4_PrintFloat_IEEE_binary128( return Format_floatbits(buffer, bufferSize, bigints, exponent, signbit, mantissaBit, hasUnequalMargins, opt); } + +#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE) +static npy_uint32 +Dragon4_PrintFloat_IEEE_binary128_le( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + FloatUnion128 buf128; + + buf128.floatingPoint = *value; + val128.lo = buf128.integer.a; + val128.hi = buf128.integer.b; + + return Dragon4_PrintFloat_IEEE_binary128(scratch, val128, opt); +} #endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */ +#if defined(HAVE_LDOUBLE_IEEE_QUAD_BE) +/* + * This function is untested; very few, if any, architectures implement + * big-endian IEEE binary128 floating point. + */ +static npy_uint32 +Dragon4_PrintFloat_IEEE_binary128_be( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + FloatUnion128 buf128; + + buf128.floatingPoint = *value; + val128.lo = buf128.integer.b; + val128.hi = buf128.integer.a; + + return Dragon4_PrintFloat_IEEE_binary128(scratch, val128, opt); +} +#endif /* HAVE_LDOUBLE_IEEE_QUAD_BE */ + +#endif /* HAVE_LDOUBLE_IEEE_QUAD_LE | HAVE_LDOUBLE_IEEE_QUAD_BE */ + #if (defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) || \ defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE)) /* diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 69833bee6..1765982a0 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -2499,7 +2499,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS]; int *op_axes[NPY_MAXARGS]; - npy_uint32 op_flags[NPY_MAXARGS]; + npy_uint32 iter_flags, op_flags[NPY_MAXARGS]; NpyIter *iter; sum_of_products_fn sop; @@ -2745,29 +2745,33 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, NPY_ITER_ALIGNED| NPY_ITER_ALLOCATE| NPY_ITER_NO_BROADCAST; + iter_flags = NPY_ITER_EXTERNAL_LOOP| + NPY_ITER_BUFFERED| + NPY_ITER_DELAY_BUFALLOC| + NPY_ITER_GROWINNER| + NPY_ITER_REDUCE_OK| + NPY_ITER_REFS_OK| + NPY_ITER_ZEROSIZE_OK; + if (out != NULL) { + iter_flags |= NPY_ITER_COPY_IF_OVERLAP; + } + if (dtype == NULL) { + iter_flags |= NPY_ITER_COMMON_DTYPE; + } /* Allocate the iterator */ - iter = NpyIter_AdvancedNew(nop+1, op, NPY_ITER_EXTERNAL_LOOP| - ((dtype != NULL) ?
0 : NPY_ITER_COMMON_DTYPE)| - NPY_ITER_BUFFERED| - NPY_ITER_DELAY_BUFALLOC| - NPY_ITER_GROWINNER| - NPY_ITER_REDUCE_OK| - NPY_ITER_REFS_OK| - NPY_ITER_ZEROSIZE_OK, - order, casting, - op_flags, op_dtypes, - ndim_iter, op_axes, NULL, 0); + iter = NpyIter_AdvancedNew(nop+1, op, iter_flags, order, casting, op_flags, + op_dtypes, ndim_iter, op_axes, NULL, 0); if (iter == NULL) { goto fail; } - /* Initialize the output to all zeros and reset the iterator */ + /* Initialize the output to all zeros */ ret = NpyIter_GetOperandArray(iter)[nop]; - Py_INCREF(ret); - PyArray_AssignZero(ret, NULL); - + if (PyArray_AssignZero(ret, NULL) < 0) { + goto fail; + } /***************************/ /* @@ -2781,16 +2785,12 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, case 1: if (ndim == 2) { if (unbuffered_loop_nop1_ndim2(iter) < 0) { - Py_DECREF(ret); - ret = NULL; goto fail; } goto finish; } else if (ndim == 3) { if (unbuffered_loop_nop1_ndim3(iter) < 0) { - Py_DECREF(ret); - ret = NULL; goto fail; } goto finish; @@ -2799,16 +2799,12 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, case 2: if (ndim == 2) { if (unbuffered_loop_nop2_ndim2(iter) < 0) { - Py_DECREF(ret); - ret = NULL; goto fail; } goto finish; } else if (ndim == 3) { if (unbuffered_loop_nop2_ndim3(iter) < 0) { - Py_DECREF(ret); - ret = NULL; goto fail; } goto finish; @@ -2819,7 +2815,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, /***************************/ if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) { - Py_DECREF(ret); goto fail; } @@ -2841,8 +2836,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, if (sop == NULL) { PyErr_SetString(PyExc_TypeError, "invalid data type for einsum"); - Py_DECREF(ret); - ret = NULL; } else if (NpyIter_GetIterSize(iter) != 0) { NpyIter_IterNextFunc *iternext; @@ -2854,7 +2847,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, iternext = NpyIter_GetIterNext(iter, NULL); if (iternext == NULL) { NpyIter_Deallocate(iter); - Py_DECREF(ret); goto fail; } dataptr = NpyIter_GetDataPtrArray(iter); @@ -2870,12 +2862,16 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, /* If the API was needed, it may have thrown an error */ if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) { - Py_DECREF(ret); - ret = NULL; + goto fail; } } finish: + if (out != NULL) { + ret = out; + } + Py_INCREF(ret); + NpyIter_Deallocate(iter); for (iop = 0; iop < nop; ++iop) { Py_DECREF(op[iop]); diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index fa68af19a..b25b4a8b6 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -1371,7 +1371,7 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim, */ /* - * Advanded indexing iteration of arrays when there is a single indexing + * Advanced indexing iteration of arrays when there is a single indexing * array which has the same memory order as the value array and both * can be trivially iterated (single stride, aligned, no casting necessary). 
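 * (For example, ``a[ind]`` where ``a`` is C-contiguous and ``ind`` is a
 * 1-D aligned intp array takes this fast path; other index types fall
 * back to the general mapiter machinery. Illustrative note.)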
*/ @@ -1405,7 +1405,7 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind, /* Check the indices beforehand */ while (itersize--) { npy_intp indval = *((npy_intp*)ind_ptr); - if (check_and_adjust_index(&indval, fancy_dim, 1, _save) < 0 ) { + if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) { return -1; } ind_ptr += ind_stride; @@ -1437,7 +1437,7 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind, npy_intp indval = *((npy_intp*)ind_ptr); assert(npy_is_aligned(ind_ptr, _ALIGN(npy_intp))); #if @isget@ - if (check_and_adjust_index(&indval, fancy_dim, 1, _save) < 0 ) { + if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) { return -1; } #else diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 6a7ffd39d..f338226c2 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -233,7 +233,8 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n) || index == Py_None || PySlice_Check(index) || PyArray_Check(index) - || !PySequence_Check(index)) { + || !PySequence_Check(index) + || PyBaseString_Check(index)) { return unpack_scalar(index, result, result_n); } @@ -1387,14 +1388,54 @@ array_subscript_asarray(PyArrayObject *self, PyObject *op) } /* + * Helper function for _get_field_view which turns a multifield + * view into a "packed" copy, as done in numpy 1.15 and before. + * In numpy 1.16 this function should be removed. + */ +NPY_NO_EXPORT int +_multifield_view_to_copy(PyArrayObject **view) { + static PyObject *copyfunc = NULL; + PyObject *viewcopy; + + /* return a repacked copy of the view */ + npy_cache_import("numpy.lib.recfunctions", "repack_fields", ©func); + if (copyfunc == NULL) { + goto view_fail; + } + + PyArray_CLEARFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE); + viewcopy = PyObject_CallFunction(copyfunc, "O", *view); + if (viewcopy == NULL) { + goto view_fail; + } + Py_DECREF(*view); + *view = (PyArrayObject*)viewcopy; + + /* warn when writing to the copy */ + PyArray_ENABLEFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE); + return 0; + +view_fail: + Py_DECREF(*view); + *view = NULL; + return 0; +} + +/* * Attempts to subscript an array using a field name or list of field names. * * If an error occurred, return 0 and set view to NULL. If the subscript is not * a string or list of strings, return -1 and set view to NULL. Otherwise * return 0 and set view to point to a new view into arr for the given fields. + * + * In numpy 1.15 and before, in the case of a list of field names the returned + * view will actually be a copy by default, with fields packed together. + * The `force_view` argument causes a view to be returned. This argument can be + * removed in 1.16 when we plan to return a view always. 
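A rough sketch of the 1.15 semantics this transition helper preserves, as described
in the comment above (the dtype and values are arbitrary examples):

    >>> import numpy as np
    >>> a = np.zeros(3, dtype=[('x', 'f8'), ('y', 'f8'), ('z', 'i4')])
    >>> sub = a[['x', 'z']]       # multi-field index: packed copy in 1.15
    >>> sub['x'][0] = 99.0        # writing to the copy warns (WARN_ON_WRITE)
    >>> a['x'][0]                 # the original array is unchanged
    0.0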
*/ NPY_NO_EXPORT int -_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) +_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view, + int force_view) { *view = NULL; @@ -1489,25 +1530,23 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) Py_DECREF(names); return 0; } - // disallow use of titles as index + /* disallow use of titles as index */ if (PyTuple_Size(tup) == 3) { PyObject *title = PyTuple_GET_ITEM(tup, 2); int titlecmp = PyObject_RichCompareBool(title, name, Py_EQ); if (titlecmp == 1) { - // if title == name, we were given a title, not a field name + /* if title == name, we got a title, not a field name */ PyErr_SetString(PyExc_KeyError, "cannot use field titles in multi-field index"); } if (titlecmp != 0 || PyDict_SetItem(fields, title, tup) < 0) { - Py_DECREF(title); Py_DECREF(name); Py_DECREF(fields); Py_DECREF(names); return 0; } - Py_DECREF(title); } - // disallow duplicate field indices + /* disallow duplicate field indices */ if (PyDict_Contains(fields, name)) { PyObject *errmsg = PyUString_FromString( "duplicate field of name "); @@ -1552,10 +1591,16 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) PyArray_FLAGS(arr), (PyObject *)arr, (PyObject *)arr, 0, 1); + if (*view == NULL) { return 0; } - return 0; + + /* the code below can be replaced by "return 0" in 1.16 */ + if (force_view) { + return 0; + } + return _multifield_view_to_copy(view); } return -1; } @@ -1583,7 +1628,7 @@ array_subscript(PyArrayObject *self, PyObject *op) /* return fields if op is a string index */ if (PyDataType_HASFIELDS(PyArray_DESCR(self))) { PyArrayObject *view; - int ret = _get_field_view(self, op, &view); + int ret = _get_field_view(self, op, &view, 0); if (ret == 0){ if (view == NULL) { return NULL; @@ -1865,7 +1910,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) /* field access */ if (PyDataType_HASFIELDS(PyArray_DESCR(self))){ PyArrayObject *view; - int ret = _get_field_view(self, ind, &view); + int ret = _get_field_view(self, ind, &view, 1); if (ret == 0){ if (view == NULL) { return -1; @@ -2037,7 +2082,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) PyArray_TRIVIALLY_ITERABLE_OP_READ, PyArray_TRIVIALLY_ITERABLE_OP_READ) || (PyArray_NDIM(tmp_arr) == 0 && - PyArray_TRIVIALLY_ITERABLE(tmp_arr))) && + PyArray_TRIVIALLY_ITERABLE(ind))) && /* Check if the type is equivalent to INTP */ PyArray_ITEMSIZE(ind) == sizeof(npy_intp) && PyArray_DESCR(ind)->kind == 'i' && diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index d6f2577a3..2e836d1d0 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -976,9 +976,12 @@ array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds) { PyObject *ufunc, *method_name, *normal_args, *ufunc_method; PyObject *result = NULL; - int num_override_args; + int has_override; - if (PyTuple_Size(args) < 2) { + assert(PyTuple_CheckExact(args)); + assert(kwds == NULL || PyDict_CheckExact(kwds)); + + if (PyTuple_GET_SIZE(args) < 2) { PyErr_SetString(PyExc_TypeError, "__array_ufunc__ requires at least 2 arguments"); return NULL; @@ -988,11 +991,11 @@ array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds) return NULL; } /* ndarray cannot handle overrides itself */ - num_override_args = PyUFunc_WithOverride(normal_args, kwds, NULL, NULL); - if (num_override_args == -1) { - return NULL; + has_override = PyUFunc_HasOverride(normal_args, kwds); + 
if (has_override < 0) { + goto cleanup; } - if (num_override_args) { + else if (has_override) { result = Py_NotImplemented; Py_INCREF(Py_NotImplemented); goto cleanup; diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index f78a748c0..6e57f1d6d 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -800,102 +800,6 @@ PyArray_CanCoerceScalar(int thistype, int neededtype, return 0; } -/* - * Make a new empty array, of the passed size, of a type that takes the - * priority of ap1 and ap2 into account. - * - * If `out` is non-NULL, memory overlap is checked with ap1 and ap2, and an - * updateifcopy temporary array may be returned. If `result` is non-NULL, the - * output array to be returned (`out` if non-NULL and the newly allocated array - * otherwise) is incref'd and put to *result. - */ -static PyArrayObject * -new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, - int nd, npy_intp dimensions[], int typenum, PyArrayObject **result) -{ - PyArrayObject *out_buf; - - if (out) { - int d; - - /* verify that out is usable */ - if (PyArray_NDIM(out) != nd || - PyArray_TYPE(out) != typenum || - !PyArray_ISCARRAY(out)) { - PyErr_SetString(PyExc_ValueError, - "output array is not acceptable (must have the right datatype, " - "number of dimensions, and be a C-Array)"); - return 0; - } - for (d = 0; d < nd; ++d) { - if (dimensions[d] != PyArray_DIM(out, d)) { - PyErr_SetString(PyExc_ValueError, - "output array has wrong dimensions"); - return 0; - } - } - - /* check for memory overlap */ - if (!(solve_may_share_memory(out, ap1, 1) == 0 && - solve_may_share_memory(out, ap2, 1) == 0)) { - /* allocate temporary output array */ - out_buf = (PyArrayObject *)PyArray_NewLikeArray(out, NPY_CORDER, - NULL, 0); - if (out_buf == NULL) { - return NULL; - } - - /* set copy-back */ - Py_INCREF(out); - if (PyArray_SetWritebackIfCopyBase(out_buf, out) < 0) { - Py_DECREF(out); - Py_DECREF(out_buf); - return NULL; - } - } - else { - Py_INCREF(out); - out_buf = out; - } - - if (result) { - Py_INCREF(out); - *result = out; - } - - return out_buf; - } - else { - PyTypeObject *subtype; - double prior1, prior2; - /* - * Need to choose an output array that can hold a sum - * -- use priority to determine which subtype. - */ - if (Py_TYPE(ap2) != Py_TYPE(ap1)) { - prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0); - prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0); - subtype = (prior2 > prior1 ? Py_TYPE(ap2) : Py_TYPE(ap1)); - } - else { - prior1 = prior2 = 0.0; - subtype = Py_TYPE(ap1); - } - - out_buf = (PyArrayObject *)PyArray_New(subtype, nd, dimensions, - typenum, NULL, NULL, 0, 0, - (PyObject *) - (prior2 > prior1 ? 
ap2 : ap1)); - - if (out_buf != NULL && result) { - Py_INCREF(out_buf); - *result = out_buf; - } - - return out_buf; - } -} - /* Could perhaps be redone to not make contiguous arrays */ /*NUMPY_API @@ -1101,7 +1005,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out) NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(ap2)); while (it1->index < it1->size) { while (it2->index < it2->size) { - dot(it1->dataptr, is1, it2->dataptr, is2, op, l, out_buf); + dot(it1->dataptr, is1, it2->dataptr, is2, op, l, NULL); op += os; PyArray_ITER_NEXT(it2); } @@ -4441,7 +4345,7 @@ static struct PyMethodDef array_module_methods[] = { "indicated by mask."}, {"bincount", (PyCFunction)arr_bincount, METH_VARARGS | METH_KEYWORDS, NULL}, - {"digitize", (PyCFunction)arr_digitize, + {"_monotonicity", (PyCFunction)arr__monotonicity, METH_VARARGS | METH_KEYWORDS, NULL}, {"interp", (PyCFunction)arr_interp, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c index ba9e9f273..7a33ac05e 100644 --- a/numpy/core/src/multiarray/nditer_api.c +++ b/numpy/core/src/multiarray/nditer_api.c @@ -1381,47 +1381,6 @@ NpyIter_GetInnerLoopSizePtr(NpyIter *iter) } /*NUMPY_API - * Resolves all writebackifcopy scratch buffers, not safe to use iterator - * operands after this call, in this iterator as well as any copies. - * Returns 0 on success, -1 on failure - */ -NPY_NO_EXPORT int -NpyIter_Close(NpyIter *iter) -{ - int ret=0, iop, nop; - PyArrayObject ** operands; - npyiter_opitflags *op_itflags; - if (iter == NULL) { - return 0; - } - nop = NIT_NOP(iter); - operands = NIT_OPERANDS(iter); - op_itflags = NIT_OPITFLAGS(iter); - /* If NPY_OP_ITFLAG_HAS_WRITEBACK flag set on operand, resolve it. - * If the resolution fails (should never happen), continue from the - * next operand and discard the writeback scratch buffers, and return - * failure status - */ - for (iop=0; iop<nop; iop++) { - if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) { - op_itflags[iop] &= ~NPY_OP_ITFLAG_HAS_WRITEBACK; - if (PyArray_ResolveWritebackIfCopy(operands[iop]) < 0) { - ret = -1; - iop++; - break; - } - } - } - for (; iop<nop; iop++) { - if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) { - op_itflags[iop] &= ~NPY_OP_ITFLAG_HAS_WRITEBACK; - PyArray_DiscardWritebackIfCopy(operands[iop]); - } - } - return ret; -} - -/*NUMPY_API * For debugging */ NPY_NO_EXPORT void @@ -2830,4 +2789,23 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, } return count * (*reduce_innersize); } + +NPY_NO_EXPORT npy_bool +npyiter_has_writeback(NpyIter *iter) +{ + int iop, nop; + npyiter_opitflags *op_itflags; + if (iter == NULL) { + return 0; + } + nop = NIT_NOP(iter); + op_itflags = NIT_OPITFLAGS(iter); + + for (iop=0; iop<nop; iop++) { + if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) { + return NPY_TRUE; + } + } + return NPY_FALSE; +} #undef NPY_ITERATOR_IMPLEMENTATION_CODE diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index b07137858..c56376f58 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -403,7 +403,6 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, */ if (!npyiter_allocate_arrays(iter, flags, op_dtype, subtype, op_flags, op_itflags, op_axes)) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return NULL; } @@ -465,14 +464,12 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, /* If buffering is set without delayed 
allocation */ if (itflags & NPY_ITFLAG_BUFFER) { if (!npyiter_allocate_transfer_functions(iter)) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return NULL; } if (!(itflags & NPY_ITFLAG_DELAYBUF)) { /* Allocate the buffers */ if (!npyiter_allocate_buffers(iter, NULL)) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return NULL; } @@ -654,6 +651,8 @@ NpyIter_Deallocate(NpyIter *iter) int iop, nop; PyArray_Descr **dtype; PyArrayObject **object; + npyiter_opitflags *op_itflags; + npy_bool resolve = 1; if (iter == NULL) { return NPY_SUCCEED; @@ -663,6 +662,7 @@ NpyIter_Deallocate(NpyIter *iter) nop = NIT_NOP(iter); dtype = NIT_DTYPES(iter); object = NIT_OPERANDS(iter); + op_itflags = NIT_OPITFLAGS(iter); /* Deallocate any buffers and buffering data */ if (itflags & NPY_ITFLAG_BUFFER) { @@ -691,15 +691,28 @@ NpyIter_Deallocate(NpyIter *iter) } } - /* Deallocate all the dtypes and objects that were iterated */ + /* + * Deallocate all the dtypes and objects that were iterated and resolve + * any writeback buffers created by the iterator + */ for(iop = 0; iop < nop; ++iop, ++dtype, ++object) { + if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) { + if (resolve && PyArray_ResolveWritebackIfCopy(*object) < 0) { + resolve = 0; + } + else { + PyArray_DiscardWritebackIfCopy(*object); + } + } Py_XDECREF(*dtype); Py_XDECREF(*object); } /* Deallocate the iterator memory */ PyObject_Free(iter); - + if (resolve == 0) { + return NPY_FAIL; + } return NPY_SUCCEED; } diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index 0b6c80c8a..5a9f3c5fa 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -1,5 +1,5 @@ /* - * This file implements the CPython wrapper of the new NumPy iterator. + * This file implements the CPython wrapper of NpyIter * * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com) * The University of British Columbia @@ -19,6 +19,10 @@ #include "common.h" #include "ctors.h" +/* Functions not part of the public NumPy C API */ +npy_bool npyiter_has_writeback(NpyIter *iter); + + typedef struct NewNpyArrayIterObject_tag NewNpyArrayIterObject; struct NewNpyArrayIterObject_tag { @@ -27,8 +31,6 @@ struct NewNpyArrayIterObject_tag { NpyIter *iter; /* Flag indicating iteration started/stopped */ char started, finished; - /* iter operands cannot be referenced if iter is closed */ - npy_bool is_closed; /* Child to update for nested iteration */ NewNpyArrayIterObject *nested_child; /* Cached values from the iterator */ @@ -88,7 +90,6 @@ npyiter_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) if (self != NULL) { self->iter = NULL; self->nested_child = NULL; - self->is_closed = 0; } return (PyObject *)self; @@ -1173,10 +1174,29 @@ fail: return NULL; } + static void npyiter_dealloc(NewNpyArrayIterObject *self) { if (self->iter) { + if (npyiter_has_writeback(self->iter)) { + if (PyErr_WarnEx(PyExc_RuntimeWarning, + "Temporary data has not been written back to one of the " + "operands. 
Typically nditer is used as a context manager " + "otherwise 'close' must be called before reading iteration " + "results.", 1) < 0) { + PyObject *s; + + s = PyUString_FromString("npyiter_dealloc"); + if (s) { + PyErr_WriteUnraisable(s); + Py_DECREF(s); + } + else { + PyErr_WriteUnraisable(Py_None); + } + } + } NpyIter_Deallocate(self->iter); self->iter = NULL; Py_XDECREF(self->nested_child); @@ -1418,12 +1438,6 @@ static PyObject *npyiter_value_get(NewNpyArrayIterObject *self) ret = npyiter_seq_item(self, 0); } else { - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - ret = PyTuple_New(nop); if (ret == NULL) { return NULL; @@ -1453,12 +1467,6 @@ static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self) "Iterator is invalid"); return NULL; } - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - nop = NpyIter_GetNOp(self->iter); operands = self->operands; @@ -1487,13 +1495,6 @@ static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self) "Iterator is invalid"); return NULL; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - nop = NpyIter_GetNOp(self->iter); ret = PyTuple_New(nop); @@ -1517,7 +1518,7 @@ static PyObject * npyiter_next(NewNpyArrayIterObject *self) { if (self->iter == NULL || self->iternext == NULL || - self->finished || self->is_closed) { + self->finished) { return NULL; } @@ -1911,13 +1912,6 @@ static PyObject *npyiter_dtypes_get(NewNpyArrayIterObject *self) "Iterator is invalid"); return NULL; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - nop = NpyIter_GetNOp(self->iter); ret = PyTuple_New(nop); @@ -2011,13 +2005,6 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i) "and no reset has been done yet"); return NULL; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - nop = NpyIter_GetNOp(self->iter); /* Negative indexing */ @@ -2090,13 +2077,6 @@ npyiter_seq_slice(NewNpyArrayIterObject *self, "and no reset has been done yet"); return NULL; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - nop = NpyIter_GetNOp(self->iter); if (ilow < 0) { ilow = 0; @@ -2156,13 +2136,6 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v) "and no reset has been done yet"); return -1; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return -1; - } - nop = NpyIter_GetNOp(self->iter); /* Negative indexing */ @@ -2234,13 +2207,6 @@ npyiter_seq_ass_slice(NewNpyArrayIterObject *self, Py_ssize_t ilow, "and no reset has been done yet"); return -1; } - - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return -1; - } - nop = NpyIter_GetNOp(self->iter); if (ilow < 0) { ilow = 0; @@ -2292,12 +2258,6 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op) return NULL; } - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return NULL; - } - if (PyInt_Check(op) || PyLong_Check(op) || (PyIndex_Check(op) && !PySequence_Check(op))) { npy_intp i = PyArray_PyIntAsIntp(op); @@ -2347,12 +2307,6 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op, return -1; } - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, - "Iterator is closed"); - return -1; - } - if (PyInt_Check(op) 
|| PyLong_Check(op) || (PyIndex_Check(op) && !PySequence_Check(op))) { npy_intp i = PyArray_PyIntAsIntp(op); @@ -2387,10 +2341,6 @@ npyiter_enter(NewNpyArrayIterObject *self) PyErr_SetString(PyExc_RuntimeError, "operation on non-initialized iterator"); return NULL; } - if (self->is_closed) { - PyErr_SetString(PyExc_ValueError, "cannot reuse closed iterator"); - return NULL; - } Py_INCREF(self); return (PyObject *)self; } @@ -2403,8 +2353,8 @@ npyiter_close(NewNpyArrayIterObject *self) if (self->iter == NULL) { Py_RETURN_NONE; } - ret = NpyIter_Close(iter); - self->is_closed = 1; + ret = NpyIter_Deallocate(iter); + self->iter = NULL; if (ret < 0) { return NULL; } diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index 448d2d9c2..f71d39405 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -15,6 +15,7 @@ #include "temp_elide.h" #include "binop_override.h" +#include "ufunc_override.h" /************************************************************************* **************** Implement Number Protocol **************************** @@ -550,6 +551,50 @@ array_power(PyArrayObject *a1, PyObject *o2, PyObject *modulo) return value; } +static PyObject * +array_positive(PyArrayObject *m1) +{ + /* + * For backwards compatibility, where + just implied a copy, + * we cannot just call n_ops.positive. Instead, we do the following + * 1. Try n_ops.positive + * 2. If we get an exception, check whether __array_ufunc__ is + * overridden; if so, we live in the future and we allow the + * TypeError to be passed on. + * 3. If not, give a deprecation warning and return a copy. + */ + PyObject *value; + if (can_elide_temp_unary(m1)) { + value = PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive); + } + else { + value = PyArray_GenericUnaryFunction(m1, n_ops.positive); + } + if (value == NULL) { + /* + * We first fetch the error, as it needs to be clear to check + * for the override. When the deprecation is removed, + * this whole stanza can be deleted. + */ + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + if (has_non_default_array_ufunc((PyObject *)m1)) { + PyErr_Restore(exc, val, tb); + return NULL; + } + /* 2018-06-28, 1.16.0 */ + if (DEPRECATE("Applying '+' to a non-numerical array is " + "ill-defined. 
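A sketch of the deprecation path `array_positive` implements, for an object array
whose elements do not define `__pos__` (output shown as in 1.16-era NumPy):

    >>> import numpy as np
    >>> a = np.array(['a', 'b'], dtype=object)
    >>> +a        # DeprecationWarning; a copy is returned for now, later a TypeError
    array(['a', 'b'], dtype=object)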
Returning a copy, but in the future " + "this will error.") < 0) { + return NULL; + } + Py_XDECREF(exc); + Py_XDECREF(val); + Py_XDECREF(tb); + value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1)); + } + return value; +} static PyObject * array_negative(PyArrayObject *m1) @@ -927,12 +972,6 @@ array_hex(PyArrayObject *v) #endif static PyObject * -_array_copy_nice(PyArrayObject *self) -{ - return PyArray_Return((PyArrayObject *) PyArray_Copy(self)); -} - -static PyObject * array_index(PyArrayObject *v) { if (!PyArray_ISINTEGER(v) || PyArray_NDIM(v) != 0) { @@ -955,7 +994,7 @@ NPY_NO_EXPORT PyNumberMethods array_as_number = { (binaryfunc)array_divmod, /*nb_divmod*/ (ternaryfunc)array_power, /*nb_power*/ (unaryfunc)array_negative, /*nb_neg*/ - (unaryfunc)_array_copy_nice, /*nb_pos*/ + (unaryfunc)array_positive, /*nb_pos*/ (unaryfunc)array_absolute, /*(unaryfunc)array_abs,*/ (inquiry)_array_nonzero, /*nb_nonzero*/ (unaryfunc)array_invert, /*nb_invert*/ diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index 25e0668ed..a32aa47ab 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -1675,16 +1675,6 @@ gentype_itemset(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) return NULL; } -static PyObject * -gentype_squeeze(PyObject *self, PyObject *args) -{ - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - Py_INCREF(self); - return self; -} - static Py_ssize_t gentype_getreadbuf(PyObject *, Py_ssize_t, void **); @@ -1738,7 +1728,7 @@ gentype_byteswap(PyObject *self, PyObject *args, PyObject *kwds) * std, var, sum, cumsum, prod, cumprod, compress, sort, argsort, * round, argmax, argmin, max, min, ptp, any, all, astype, resize, * reshape, choose, tostring, tobytes, copy, searchsorted, view, - * flatten, ravel# + * flatten, ravel, squeeze# */ static PyObject * gentype_@name@(PyObject *self, PyObject *args, PyObject *kwds) @@ -2121,7 +2111,7 @@ static PyMethodDef gentype_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"squeeze", (PyCFunction)gentype_squeeze, - METH_VARARGS, NULL}, + METH_VARARGS | METH_KEYWORDS, NULL}, {"view", (PyCFunction)gentype_view, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h index 094612b7d..f9c671f77 100644 --- a/numpy/core/src/private/lowlevel_strided_loops.h +++ b/numpy/core/src/private/lowlevel_strided_loops.h @@ -689,21 +689,16 @@ npy_bswap8_unaligned(char * x) #define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0 #define PyArray_TRIVIALLY_ITERABLE_OP_READ 1 -#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \ - PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \ - PyArray_CompareLists(PyArray_DIMS(arr1), \ - PyArray_DIMS(arr2), \ - PyArray_NDIM(arr1)) && \ - (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \ - NPY_ARRAY_F_CONTIGUOUS)) & \ - (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \ - NPY_ARRAY_F_CONTIGUOUS)) \ - ) +#define PyArray_TRIVIALLY_ITERABLE(arr) ( \ + PyArray_NDIM(arr) <= 1 || \ + PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \ + PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \ + ) #define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \ - size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \ - PyArray_STRIDE(arr, 0) : \ - PyArray_ITEMSIZE(arr))) + assert(PyArray_TRIVIALLY_ITERABLE(arr)), \ + size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? 
\ + PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr))) static NPY_INLINE int PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2, @@ -757,15 +752,22 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead); } +#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \ + PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \ + PyArray_CompareLists(PyArray_DIMS(arr1), \ + PyArray_DIMS(arr2), \ + PyArray_NDIM(arr1)) && \ + (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \ + NPY_ARRAY_F_CONTIGUOUS)) & \ + (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \ + NPY_ARRAY_F_CONTIGUOUS)) \ + ) + #define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \ arr1, arr2, arr1_read, arr2_read)) -#define PyArray_TRIVIALLY_ITERABLE(arr) ( \ - PyArray_NDIM(arr) <= 1 || \ - PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \ - PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \ - ) + #define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \ count = PyArray_SIZE(arr); \ data = PyArray_BYTES(arr); \ @@ -774,7 +776,6 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr PyArray_STRIDE(arr, 0) : \ PyArray_ITEMSIZE(arr))); - #define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) ( \ PyArray_TRIVIALLY_ITERABLE(arr1) && \ (PyArray_NDIM(arr2) == 0 || \ diff --git a/numpy/core/src/private/npy_config.h b/numpy/core/src/private/npy_config.h index 107b3cb5b..8143e7719 100644 --- a/numpy/core/src/private/npy_config.h +++ b/numpy/core/src/private/npy_config.h @@ -15,7 +15,8 @@ * amd64 is not harmed much by the bloat as the system provides 16 byte * alignment by default. */ -#if (defined NPY_CPU_X86 || defined _WIN32) +#if (defined NPY_CPU_X86 || defined _WIN32 || defined NPY_CPU_ARMEL_AARCH32 ||\ + defined NPY_CPU_ARMEB_AARCH32) #define NPY_MAX_COPY_ALIGNMENT 8 #else #define NPY_MAX_COPY_ALIGNMENT 16 diff --git a/numpy/core/src/private/ufunc_override.c b/numpy/core/src/private/ufunc_override.c index e405155cf..33b54c665 100644 --- a/numpy/core/src/private/ufunc_override.c +++ b/numpy/core/src/private/ufunc_override.c @@ -22,7 +22,7 @@ * nor to the default __array_ufunc__ method, so instead we import locally. * TODO: Can this really not be done more smartly? */ -static PyObject * +NPY_NO_EXPORT PyObject * get_non_default_array_ufunc(PyObject *obj) { static PyObject *ndarray = NULL; @@ -54,11 +54,71 @@ get_non_default_array_ufunc(PyObject *obj) } /* - * Check whether a set of input and output args have a non-default - * `__array_ufunc__` method. Return the number of overrides, setting - * corresponding objects in PyObject array with_override and the corresponding - * __array_ufunc__ methods in methods (both only if not NULL, and both using - * new references). + * Check whether an object has __array_ufunc__ defined on its class and it + * is not the default, i.e., the object is not an ndarray, and its + * __array_ufunc__ is not the same as that of ndarray. + * + * Returns 1 if this is the case, 0 if not. 
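A doctest-style illustration of what counts as a non-default `__array_ufunc__`
(the `MyArray` subclass is hypothetical; plain ndarrays, and subclasses that
inherit the default, are ignored):

    >>> import numpy as np
    >>> class MyArray(np.ndarray):
    ...     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    ...         return 'overridden'
    >>> np.add(np.ones(2), np.ones(2).view(MyArray))
    'overridden'
    >>> np.add(np.ones(2), np.ones(2))   # no override anywhere: normal dispatch
    array([2., 2.])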
+ */ + +NPY_NO_EXPORT int +has_non_default_array_ufunc(PyObject * obj) +{ + PyObject *method = get_non_default_array_ufunc(obj); + if (method) { + Py_DECREF(method); + return 1; + } + else { + return 0; + } +} + +/* + * Get possible out argument from kwds, and returns the number of outputs + * contained within it: if a tuple, the number of elements in it, 1 otherwise. + * The out argument itself is returned in out_kwd_obj, and the outputs + * in the out_obj array (all as borrowed references). + * + * Returns -1 if kwds is not a dict, 0 if no outputs found. + */ +static int +get_out_objects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs) +{ + if (kwds == NULL) { + return 0; + } + if (!PyDict_CheckExact(kwds)) { + PyErr_SetString(PyExc_TypeError, + "Internal Numpy error: call to PyUFunc_WithOverride " + "with non-dict kwds"); + return -1; + } + /* borrowed reference */ + *out_kwd_obj = PyDict_GetItemString(kwds, "out"); + if (*out_kwd_obj == NULL) { + return 0; + } + if (PyTuple_CheckExact(*out_kwd_obj)) { + *out_objs = PySequence_Fast_ITEMS(*out_kwd_obj); + return PySequence_Fast_GET_SIZE(*out_kwd_obj); + } + else { + *out_objs = out_kwd_obj; + return 1; + } +} + +/* + * For each positional argument and each argument in a possible "out" + * keyword, look for overrides of the standard ufunc behaviour, i.e., + * non-default __array_ufunc__ methods. + * + * Returns the number of overrides, setting corresponding objects + * in PyObject array ``with_override`` and the corresponding + * __array_ufunc__ methods in ``methods`` (both using new references). + * + * Only the first override for a given class is returned. * * returns -1 on failure. */ @@ -67,64 +127,52 @@ PyUFunc_WithOverride(PyObject *args, PyObject *kwds, PyObject **with_override, PyObject **methods) { int i; - - int nargs; - int nout_kwd = 0; - int out_kwd_is_tuple = 0; int num_override_args = 0; + int narg, nout = 0; + PyObject *out_kwd_obj; + PyObject **arg_objs, **out_objs; - PyObject *obj; - PyObject *out_kwd_obj = NULL; - /* - * Check inputs - */ - if (!PyTuple_Check(args)) { - PyErr_SetString(PyExc_TypeError, - "Internal Numpy error: call to PyUFunc_HasOverride " - "with non-tuple"); - goto fail; + narg = PyTuple_Size(args); + if (narg < 0) { + return -1; } - nargs = PyTuple_GET_SIZE(args); - if (nargs > NPY_MAXARGS) { - PyErr_SetString(PyExc_TypeError, - "Internal Numpy error: too many arguments in call " - "to PyUFunc_HasOverride"); - goto fail; - } - /* be sure to include possible 'out' keyword argument. */ - if (kwds && PyDict_CheckExact(kwds)) { - out_kwd_obj = PyDict_GetItemString(kwds, "out"); - if (out_kwd_obj != NULL) { - out_kwd_is_tuple = PyTuple_CheckExact(out_kwd_obj); - if (out_kwd_is_tuple) { - nout_kwd = PyTuple_GET_SIZE(out_kwd_obj); - } - else { - nout_kwd = 1; - } - } + arg_objs = PySequence_Fast_ITEMS(args); + + nout = get_out_objects(kwds, &out_kwd_obj, &out_objs); + if (nout < 0) { + return -1; } - for (i = 0; i < nargs + nout_kwd; ++i) { - PyObject *method; - if (i < nargs) { - obj = PyTuple_GET_ITEM(args, i); + for (i = 0; i < narg + nout; ++i) { + PyObject *obj; + int j; + int new_class = 1; + + if (i < narg) { + obj = arg_objs[i]; } else { - if (out_kwd_is_tuple) { - obj = PyTuple_GET_ITEM(out_kwd_obj, i - nargs); - } - else { - obj = out_kwd_obj; - } + obj = out_objs[i - narg]; } /* - * Now see if the object provides an __array_ufunc__. 
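`get_out_objects` normalizes the two accepted spellings of the `out` keyword, a
bare array and a tuple with one entry per output. Sketch:

    >>> import numpy as np
    >>> x, res = np.arange(3.), np.empty(3)
    >>> r1 = np.add(x, 1, out=res)       # bare array: treated as a single output
    >>> r2 = np.add(x, 1, out=(res,))    # tuple form: one entry per ufunc output
    >>> r1 is res and r2 is res
    True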
However, we should - * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as - * any ndarray subclass instances that did not override __array_ufunc__. + * Have we seen this class before? If so, ignore. */ - method = get_non_default_array_ufunc(obj); - if (method != NULL) { + for (j = 0; j < num_override_args; j++) { + new_class = (Py_TYPE(obj) != Py_TYPE(with_override[j])); + if (!new_class) { + break; + } + } + if (new_class) { + /* + * Now see if the object provides an __array_ufunc__. However, we should + * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as + * any ndarray subclass instances that did not override __array_ufunc__. + */ + PyObject *method = get_non_default_array_ufunc(obj); + if (method == NULL) { + continue; + } if (method == Py_None) { PyErr_Format(PyExc_TypeError, "operand '%.200s' does not support ufuncs " @@ -133,23 +181,61 @@ PyUFunc_WithOverride(PyObject *args, PyObject *kwds, Py_DECREF(method); goto fail; } - if (with_override != NULL) { - Py_INCREF(obj); - with_override[num_override_args] = obj; - } - if (methods != NULL) { - methods[num_override_args] = method; - } + Py_INCREF(obj); + with_override[num_override_args] = obj; + methods[num_override_args] = method; ++num_override_args; } } return num_override_args; fail: - if (methods != NULL) { - for (i = 0; i < num_override_args; i++) { - Py_XDECREF(methods[i]); - } + for (i = 0; i < num_override_args; i++) { + Py_DECREF(with_override[i]); + Py_DECREF(methods[i]); } return -1; } + +/* + * Check whether any of a set of input and output args have a non-default + * __array_ufunc__ method. Return 1 if so, 0 if not. + * + * This function primarily exists to help ndarray.__array_ufunc__ determine + * whether it can support a ufunc (which is the case only if none of the + * operands have an override). Thus, unlike in PyUFunc_CheckOverride, the + * actual overrides are not needed and one can stop looking once one is found. + * + * TODO: move this function and has_non_default_array_ufunc closer to ndarray. + */ +NPY_NO_EXPORT int +PyUFunc_HasOverride(PyObject *args, PyObject *kwds) +{ + int i; + int nin, nout; + PyObject *out_kwd_obj; + PyObject **in_objs, **out_objs; + + /* check inputs */ + nin = PyTuple_Size(args); + if (nin < 0) { + return -1; + } + in_objs = PySequence_Fast_ITEMS(args); + for (i = 0; i < nin; ++i) { + if (has_non_default_array_ufunc(in_objs[i])) { + return 1; + } + } + /* check outputs, if any */ + nout = get_out_objects(kwds, &out_kwd_obj, &out_objs); + if (nout < 0) { + return -1; + } + for (i = 0; i < nout; i++) { + if (has_non_default_array_ufunc(out_objs[i])) { + return 1; + } + } + return 0; +} diff --git a/numpy/core/src/private/ufunc_override.h b/numpy/core/src/private/ufunc_override.h index 2ed1c626f..5b269d270 100644 --- a/numpy/core/src/private/ufunc_override.h +++ b/numpy/core/src/private/ufunc_override.h @@ -4,6 +4,34 @@ #include "npy_config.h" /* + * Check whether an object has __array_ufunc__ defined on its class and it + * is not the default, i.e., the object is not an ndarray, and its + * __array_ufunc__ is not the same as that of ndarray. + * + * Returns a new reference, the value of type(obj).__array_ufunc__ + * + * If the __array_ufunc__ matches that of ndarray, or does not exist, return + * NULL. + * + * Note that since this module is used with both multiarray and umath, we do + * not have access to PyArray_Type and therewith neither to PyArray_CheckExact + * nor to the default __array_ufunc__ method, so instead we import locally. 
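Because only the first override per class is recorded, two operands of the same
overriding class contribute a single entry, and the first operand's method wins.
A sketch with a hypothetical subclass:

    >>> import numpy as np
    >>> class Tagged(np.ndarray):
    ...     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    ...         return id(self)
    >>> a = np.zeros(2).view(Tagged)
    >>> b = np.zeros(2).view(Tagged)
    >>> np.add(a, b) == id(a)     # b's duplicate class entry was skipped
    True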
+ * TODO: Can this really not be done more smartly? + */ +NPY_NO_EXPORT PyObject * +get_non_default_array_ufunc(PyObject *obj); + +/* + * Check whether an object has __array_ufunc__ defined on its class and it + * is not the default, i.e., the object is not an ndarray, and its + * __array_ufunc__ is not the same as that of ndarray. + * + * Returns 1 if this is the case, 0 if not. + */ +NPY_NO_EXPORT int +has_non_default_array_ufunc(PyObject * obj); + +/* * Check whether a set of input and output args have a non-default * `__array_ufunc__` method. Returns the number of overrides, setting * corresponding objects in PyObject array with_override (if not NULL). @@ -12,4 +40,7 @@ NPY_NO_EXPORT int PyUFunc_WithOverride(PyObject *args, PyObject *kwds, PyObject **with_override, PyObject **methods); + +NPY_NO_EXPORT int +PyUFunc_HasOverride(PyObject *args, PyObject *kwds); #endif diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 1ca298b30..0b02031a7 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1874,9 +1874,13 @@ NPY_NO_EXPORT void } else { BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; + @type@ in1 = *(@type@ *)ip1; const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2; + in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2; + if (npy_isnan(in1)) { + npy_set_floatstatus_invalid(); + } + *((@type@ *)op1) = in1; } } } diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index c0bc47b7b..4a381ba12 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -347,8 +347,6 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, PyObject *with_override[NPY_MAXARGS]; PyObject *array_ufunc_methods[NPY_MAXARGS]; - PyObject *obj; - PyObject *other_obj; PyObject *out; PyObject *method_name = NULL; @@ -511,21 +509,18 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, /* Choose an overriding argument */ for (i = 0; i < num_override_args; i++) { - obj = with_override[i]; - if (obj == NULL) { + override_obj = with_override[i]; + if (override_obj == NULL) { continue; } - /* Get the first instance of an overriding arg.*/ - override_obj = obj; - /* Check for sub-types to the right of obj. */ for (j = i + 1; j < num_override_args; j++) { - other_obj = with_override[j]; + PyObject *other_obj = with_override[j]; if (other_obj != NULL && - PyObject_Type(other_obj) != PyObject_Type(obj) && + Py_TYPE(other_obj) != Py_TYPE(override_obj) && PyObject_IsInstance(other_obj, - PyObject_Type(override_obj))) { + (PyObject *)Py_TYPE(override_obj))) { override_obj = NULL; break; } diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 5e92bc991..20c448d8b 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -71,6 +71,13 @@ typedef struct { provided, then this is NULL. */ } ufunc_full_args; +/* C representation of the context argument to __array_wrap__ */ +typedef struct { + PyUFuncObject *ufunc; + ufunc_full_args args; + int out_i; +} _ufunc_context; + /* Get the arg tuple to pass in the context argument to __array_wrap__ and * __array_prepare__. * @@ -303,6 +310,141 @@ _find_array_prepare(ufunc_full_args args, } +/* + * This function analyzes the input arguments + * and determines an appropriate __array_wrap__ function to call + * for the outputs. 
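At the Python level, the wrap selection corresponds to the legacy `__array_wrap__`
hook on subclasses; a sketch assuming the era's three-argument signature with a
`(ufunc, args, out_i)` context tuple (the `Wrapped` subclass is hypothetical):

    >>> import numpy as np
    >>> class Wrapped(np.ndarray):
    ...     def __array_wrap__(self, obj, context=None):
    ...         print('wrapping result of', context[0].__name__)
    ...         return np.asarray(obj)
    >>> r = np.sin(np.zeros(2).view(Wrapped))
    wrapping result of sin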
+ * + * If an output argument is provided, then it is wrapped + * with its own __array_wrap__ not with the one determined by + * the input arguments. + * + * if the provided output argument is already an array, + * the wrapping function is None (which means no wrapping will + * be done --- not even PyArray_Return). + * + * A NULL is placed in output_wrap for outputs that + * should just have PyArray_Return called. + */ +static void +_find_array_wrap(ufunc_full_args args, PyObject *kwds, + PyObject **output_wrap, int nin, int nout) +{ + int i; + PyObject *obj; + PyObject *wrap = NULL; + + /* + * If a 'subok' parameter is passed and isn't True, don't wrap but put None + * into slots with out arguments which means return the out argument + */ + if (kwds != NULL && (obj = PyDict_GetItem(kwds, + npy_um_str_subok)) != NULL) { + if (obj != Py_True) { + /* skip search for wrap members */ + goto handle_out; + } + } + + /* + * Determine the wrapping function given by the input arrays + * (could be NULL). + */ + wrap = _find_array_method(args.in, npy_um_str_array_wrap); + + /* + * For all the output arrays decide what to do. + * + * 1) Use the wrap function determined from the input arrays + * This is the default if the output array is not + * passed in. + * + * 2) Use the __array_wrap__ method of the output object + * passed in. -- this is special cased for + * exact ndarray so that no PyArray_Return is + * done in that case. + */ +handle_out: + if (args.out == NULL) { + for (i = 0; i < nout; i++) { + Py_XINCREF(wrap); + output_wrap[i] = wrap; + } + } + else { + for (i = 0; i < nout; i++) { + output_wrap[i] = _get_output_array_method( + PyTuple_GET_ITEM(args.out, i), npy_um_str_array_wrap, wrap); + } + } + + Py_XDECREF(wrap); + return; +} + + +/* + * Apply the __array_wrap__ function with the given array and content. + * + * Interprets wrap=None and wrap=NULL as intended by _find_array_wrap + * + * Steals a reference to obj and wrap. + * Pass context=NULL to indicate there is no context. + */ +static PyObject * +_apply_array_wrap( + PyObject *wrap, PyArrayObject *obj, _ufunc_context const *context) { + if (wrap == NULL) { + /* default behavior */ + return PyArray_Return(obj); + } + else if (wrap == Py_None) { + Py_DECREF(wrap); + return (PyObject *)obj; + } + else { + PyObject *res; + PyObject *py_context = NULL; + + /* Convert the context object to a tuple, if present */ + if (context == NULL) { + py_context = Py_None; + Py_INCREF(py_context); + } + else { + PyObject *args_tup; + /* Call the method with appropriate context */ + args_tup = _get_wrap_prepare_args(context->args); + if (args_tup == NULL) { + goto fail; + } + py_context = Py_BuildValue("OOi", + context->ufunc, args_tup, context->out_i); + Py_DECREF(args_tup); + if (py_context == NULL) { + goto fail; + } + } + /* try __array_wrap__(obj, context) */ + res = PyObject_CallFunctionObjArgs(wrap, obj, py_context, NULL); + Py_DECREF(py_context); + + /* try __array_wrap__(obj) if the context argument is not accepted */ + if (res == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); + res = PyObject_CallFunctionObjArgs(wrap, obj, NULL); + } + Py_DECREF(wrap); + Py_DECREF(obj); + return res; + fail: + Py_DECREF(wrap); + Py_DECREF(obj); + return NULL; + } +} + + /*UFUNC_API * * On return, if errobj is populated with a non-NULL value, the caller @@ -552,6 +694,181 @@ ufunc_get_name_cstr(PyUFuncObject *ufunc) { } /* + * Helpers for keyword parsing + */ + +/* + * Find key in a list of pointers to keyword names. 
+ * The list should end with NULL. + * + * Returns either the index into the list (pointing to the final key with NULL + * if no match was found), or -1 on failure. + */ +static npy_intp +locate_key(PyObject **kwnames, PyObject *key) +{ + PyObject **kwname = kwnames; + while (*kwname != NULL && *kwname != key) { + kwname++; + } + /* Slow fallback, just in case */ + if (*kwname == NULL) { + int cmp = 0; + kwname = kwnames; + while (*kwname != NULL && + (cmp = PyObject_RichCompareBool(key, *kwname, + Py_EQ)) == 0) { + kwname++; + } + if (cmp < 0) { + return -1; + } + } + return kwname - kwnames; +} + +/* + * Parse keyword arguments, matching against kwnames + * + * Arguments beyond kwnames (the va_list) should contain converters and outputs + * for each keyword name (where an output can be NULL to indicate the particular + * keyword should be ignored). + * + * Returns 0 on success, -1 on failure with an error set. + * + * Note that the parser does not clean up on failure, i.e., already parsed keyword + * values may hold new references, which the caller has to remove. + * + * TODO: ufunc is only used for the name in error messages; passing on the + * name instead might be an option. + * + * TODO: instead of having this function ignore of keywords for which the + * corresponding output is NULL, the calling routine should prepare the + * correct list. + */ +static int +parse_ufunc_keywords(PyUFuncObject *ufunc, PyObject *kwds, PyObject **kwnames, ...) +{ + va_list va; + PyObject *key, *value; + Py_ssize_t pos = 0; + typedef int converter(PyObject *, void *); + + while (PyDict_Next(kwds, &pos, &key, &value)) { + int i; + converter *convert; + void *output = NULL; + npy_intp index = locate_key(kwnames, key); + if (index < 0) { + return -1; + } + if (kwnames[index]) { + va_start(va, kwnames); + for (i = 0; i <= index; i++) { + convert = va_arg(va, converter *); + output = va_arg(va, void *); + } + va_end(va); + } + if (output) { + if (!convert(value, output)) { + return -1; + } + } + else { +#if PY_VERSION_HEX >= 0x03000000 + PyErr_Format(PyExc_TypeError, + "'%S' is an invalid keyword to ufunc '%s'", + key, ufunc_get_name_cstr(ufunc)); +#else + char *str = PyString_AsString(key); + if (str == NULL) { + PyErr_Clear(); + PyErr_SetString(PyExc_TypeError, "invalid keyword argument"); + } + else { + PyErr_Format(PyExc_TypeError, + "'%s' is an invalid keyword to ufunc '%s'", + str, ufunc_get_name_cstr(ufunc)); + } +#endif + return -1; + } + } + return 0; +} + +/* + * Converters for use in parsing of keywords arguments. + */ +NPY_NO_EXPORT int +_subok_converter(PyObject *obj, int *subok) +{ + if (PyBool_Check(obj)) { + *subok = (obj == Py_True); + return NPY_SUCCEED; + } + else { + PyErr_SetString(PyExc_TypeError, + "'subok' must be a boolean"); + return NPY_FAIL; + } +} + +NPY_NO_EXPORT int +_keepdims_converter(PyObject *obj, int *keepdims) +{ + if (PyBool_Check(obj)) { + *keepdims = (obj == Py_True); + return NPY_SUCCEED; + } + else { + PyErr_SetString(PyExc_TypeError, + "'keepdims' must be a boolean"); + return NPY_FAIL; + } +} + +NPY_NO_EXPORT int +_wheremask_converter(PyObject *obj, PyArrayObject **wheremask) +{ + /* + * Optimization: where=True is the same as no where argument. + * This lets us document True as the default. 
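The `where=True` short-circuit in `_wheremask_converter` is what allows `True` to
be documented as the default; any other value is converted to a boolean mask array.
Sketch:

    >>> import numpy as np
    >>> x = np.arange(4.)
    >>> out = np.zeros(4)
    >>> _ = np.add(x, 10, out=out, where=x > 1)   # only masked elements are written
    >>> out
    array([ 0.,  0., 12., 13.])
    >>> _ = np.add(x, 10, out=out, where=True)    # same as passing no mask at all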
+ */ + if (obj == Py_True) { + return NPY_SUCCEED; + } + else { + PyArray_Descr *dtype = PyArray_DescrFromType(NPY_BOOL); + if (dtype == NULL) { + return NPY_FAIL; + } + /* PyArray_FromAny steals reference to dtype, even on failure */ + *wheremask = (PyArrayObject *)PyArray_FromAny(obj, dtype, 0, 0, 0, NULL); + if ((*wheremask) == NULL) { + return NPY_FAIL; + } + return NPY_SUCCEED; + } +} + +NPY_NO_EXPORT int +_new_reference(PyObject *obj, PyObject **out) +{ + Py_INCREF(obj); + *out = obj; + return NPY_SUCCEED; +} + +NPY_NO_EXPORT int +_borrowed_reference(PyObject *obj, PyObject **out) +{ + *out = obj; + return NPY_SUCCEED; +} + +/* * Parses the positional and keyword arguments for a generic ufunc call. * All returned arguments are new references (with optional ones NULL * if not present) @@ -575,15 +892,9 @@ get_ufunc_arguments(PyUFuncObject *ufunc, int nout = ufunc->nout; int nop = ufunc->nargs; PyObject *obj, *context; - PyObject *str_key_obj = NULL; - const char *ufunc_name = ufunc_get_name_cstr(ufunc); - int type_num; - - int any_flexible = 0, any_object = 0, any_flexible_userloops = 0; - int has_sig = 0; - + PyArray_Descr *dtype = NULL; /* - * Initialize objects so caller knows when outputs and other optional + * Initialize output objects so caller knows when outputs and optional * arguments are set (also means we can safely XDECREF on failure). */ for (i = 0; i < nop; i++) { @@ -638,166 +949,6 @@ get_ufunc_arguments(PyUFuncObject *ufunc, if (out_op[i] == NULL) { goto fail; } - - type_num = PyArray_DESCR(out_op[i])->type_num; - if (!any_flexible && - PyTypeNum_ISFLEXIBLE(type_num)) { - any_flexible = 1; - } - if (!any_object && - PyTypeNum_ISOBJECT(type_num)) { - any_object = 1; - } - - /* - * If any operand is a flexible dtype, check to see if any - * struct dtype ufuncs are registered. A ufunc has been registered - * for a struct dtype if ufunc's arg_dtypes array is not NULL. - */ - if (PyTypeNum_ISFLEXIBLE(type_num) && - !any_flexible_userloops && - ufunc->userloops != NULL) { - PyUFunc_Loop1d *funcdata; - PyObject *key, *obj; - key = PyInt_FromLong(type_num); - if (key == NULL) { - continue; - } - obj = PyDict_GetItem(ufunc->userloops, key); - Py_DECREF(key); - if (obj == NULL) { - continue; - } - funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj); - while (funcdata != NULL) { - if (funcdata->arg_dtypes != NULL) { - any_flexible_userloops = 1; - break; - } - funcdata = funcdata->next; - } - } - } - - if (any_flexible && !any_flexible_userloops && !any_object && nin == 2) { - /* Traditionally, we return -2 here (meaning "NotImplemented") anytime - * we hit the above condition. - * - * This condition basically means "we are doomed", b/c the "flexible" - * dtypes -- strings and void -- cannot have their own ufunc loops - * registered (except via the special "flexible userloops" mechanism), - * and they can't be cast to anything except object (and we only cast - * to object if any_object is true). So really we should do nothing - * here and continue and let the proper error be raised. But, we can't - * quite yet, b/c of backcompat. - * - * Most of the time, this NotImplemented either got returned directly - * to the user (who can't do anything useful with it), or got passed - * back out of a special function like __mul__. And fortunately, for - * almost all special functions, the end result of this was a - * TypeError. Which is also what we get if we just continue without - * this special case, so this special case is unnecessary. 
- * - * The only thing that actually depended on the NotImplemented is - * array_richcompare, which did two things with it. First, it needed - * to see this NotImplemented in order to implement the special-case - * comparisons for - * - * string < <= == != >= > string - * void == != void - * - * Now it checks for those cases first, before trying to call the - * ufunc, so that's no problem. What it doesn't handle, though, is - * cases like - * - * float < string - * - * or - * - * float == void - * - * For those, it just let the NotImplemented bubble out, and accepted - * Python's default handling. And unfortunately, for comparisons, - * Python's default is *not* to raise an error. Instead, it returns - * something that depends on the operator: - * - * == return False - * != return True - * < <= >= > Python 2: use "fallback" (= weird and broken) ordering - * Python 3: raise TypeError (hallelujah) - * - * In most cases this is straightforwardly broken, because comparison - * of two arrays should always return an array, and here we end up - * returning a scalar. However, there is an exception: if we are - * comparing two scalars for equality, then it actually is correct to - * return a scalar bool instead of raising an error. If we just - * removed this special check entirely, then "np.float64(1) == 'foo'" - * would raise an error instead of returning False, which is genuinely - * wrong. - * - * The proper end goal here is: - * 1) == and != should be implemented in a proper vectorized way for - * all types. The short-term hack for this is just to add a - * special case to PyUFunc_DefaultLegacyInnerLoopSelector where - * if it can't find a comparison loop for the given types, and - * the ufunc is np.equal or np.not_equal, then it returns a loop - * that just fills the output array with False (resp. True). Then - * array_richcompare could trust that whenever its special cases - * don't apply, simply calling the ufunc will do the right thing, - * even without this special check. - * 2) < <= >= > should raise an error if no comparison function can - * be found. array_richcompare already handles all string <> - * string cases, and void dtypes don't have ordering, so again - * this would mean that array_richcompare could simply call the - * ufunc and it would do the right thing (i.e., raise an error), - * again without needing this special check. - * - * So this means that for the transition period, our goal is: - * == and != on scalars should simply return NotImplemented like - * they always did, since everything ends up working out correctly - * in this case only - * == and != on arrays should issue a FutureWarning and then return - * NotImplemented - * < <= >= > on all flexible dtypes on py2 should raise a - * DeprecationWarning, and then return NotImplemented. On py3 we - * skip the warning, though, b/c it would just be immediately be - * followed by an exception anyway. - * - * And for all other operations, we let things continue as normal. - */ - /* strcmp() is a hack but I think we can get away with it for this - * temporary measure. 
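The scalar special case this comment defends, in doctest form: equality between a
scalar and an incomparable type must return a bool rather than raise.

    >>> import numpy as np
    >>> np.float64(1) == 'foo'    # no comparison loop exists, but this must not error
    False
    >>> np.float64(1) != 'foo'
    True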
- */ - if (!strcmp(ufunc_name, "equal") || - !strcmp(ufunc_name, "not_equal")) { - /* Warn on non-scalar, return NotImplemented regardless */ - if (PyArray_NDIM(out_op[0]) != 0 || - PyArray_NDIM(out_op[1]) != 0) { - if (DEPRECATE_FUTUREWARNING( - "elementwise comparison failed; returning scalar " - "instead, but in the future will perform elementwise " - "comparison") < 0) { - goto fail; - } - } - Py_DECREF(out_op[0]); - Py_DECREF(out_op[1]); - return -2; - } - else if (!strcmp(ufunc_name, "less") || - !strcmp(ufunc_name, "less_equal") || - !strcmp(ufunc_name, "greater") || - !strcmp(ufunc_name, "greater_equal")) { -#if !defined(NPY_PY3K) - if (DEPRECATE("unorderable dtypes; returning scalar but in " - "the future this will be an error") < 0) { - goto fail; - } -#endif - Py_DECREF(out_op[0]); - Py_DECREF(out_op[1]); - return -2; - } } /* Get positional output arguments */ @@ -809,253 +960,149 @@ get_ufunc_arguments(PyUFuncObject *ufunc, } /* - * Get keyword output and other arguments. - * Raise an error if anything else is present in the - * keyword dictionary. + * If keywords are present, get keyword output and other arguments. + * Raise an error if anything else is present in the keyword dictionary. */ - if (kwds != NULL) { - PyObject *key, *value; - Py_ssize_t pos = 0; - while (PyDict_Next(kwds, &pos, &key, &value)) { - Py_ssize_t length = 0; - char *str = NULL; - int bad_arg = 1; - -#if defined(NPY_PY3K) - Py_XDECREF(str_key_obj); - str_key_obj = PyUnicode_AsASCIIString(key); - if (str_key_obj != NULL) { - key = str_key_obj; - } -#endif - - if (PyBytes_AsStringAndSize(key, &str, &length) < 0) { - PyErr_Clear(); - PyErr_SetString(PyExc_TypeError, "invalid keyword argument"); + if (kwds) { + PyObject *out_kwd = NULL; + PyObject *sig = NULL; + static PyObject *kwnames[13] = {NULL}; + if (kwnames[0] == NULL) { + kwnames[0] = npy_um_str_out; + kwnames[1] = npy_um_str_where; + kwnames[2] = npy_um_str_axes; + kwnames[3] = npy_um_str_axis; + kwnames[4] = npy_um_str_keepdims; + kwnames[5] = npy_um_str_casting; + kwnames[6] = npy_um_str_order; + kwnames[7] = npy_um_str_dtype; + kwnames[8] = npy_um_str_subok; + kwnames[9] = npy_um_str_signature; + kwnames[10] = npy_um_str_sig; + kwnames[11] = npy_um_str_extobj; + kwnames[12] = NULL; /* sentinel */ + } + /* + * Parse using converters to calculate outputs + * (NULL outputs are treated as indicating a keyword is not allowed). + */ + if (parse_ufunc_keywords( + ufunc, kwds, kwnames, + _borrowed_reference, &out_kwd, + _wheremask_converter, out_wheremask, /* new reference */ + _new_reference, out_axes, + _new_reference, out_axis, + _keepdims_converter, out_keepdims, + PyArray_CastingConverter, out_casting, + PyArray_OrderConverter, out_order, + PyArray_DescrConverter2, &dtype, /* new reference */ + _subok_converter, out_subok, + _new_reference, out_typetup, + _borrowed_reference, &sig, + _new_reference, out_extobj) < 0) { + goto fail; + } + /* + * Check that outputs were not passed as positional as well, + * and that they are either None or an array. + */ + if (out_kwd) { /* borrowed reference */ + /* + * Output arrays are generally specified as a tuple of arrays + * and None, but may be a single array or None for ufuncs + * with a single output. 
+ */ + if (nargs > nin) { + PyErr_SetString(PyExc_ValueError, + "cannot specify 'out' as both a " + "positional and keyword argument"); goto fail; } - - switch (str[0]) { - case 'a': - /* possible axes argument for generalized ufunc */ - if (out_axes != NULL && strcmp(str, "axes") == 0) { - if (out_axis != NULL && *out_axis != NULL) { - PyErr_SetString(PyExc_TypeError, - "cannot specify both 'axis' and 'axes'"); - goto fail; - } - Py_INCREF(value); - *out_axes = value; - bad_arg = 0; - } - else if (out_axis != NULL && strcmp(str, "axis") == 0) { - if (out_axes != NULL && *out_axes != NULL) { - PyErr_SetString(PyExc_TypeError, - "cannot specify both 'axis' and 'axes'"); - goto fail; - } - Py_INCREF(value); - *out_axis = value; - bad_arg = 0; - } - break; - case 'c': - /* Provides a policy for allowed casting */ - if (strcmp(str, "casting") == 0) { - if (!PyArray_CastingConverter(value, out_casting)) { - goto fail; - } - bad_arg = 0; - } - break; - case 'd': - /* Another way to specify 'sig' */ - if (strcmp(str, "dtype") == 0) { - /* Allow this parameter to be None */ - PyArray_Descr *dtype; - if (!PyArray_DescrConverter2(value, &dtype)) { - goto fail; - } - if (dtype != NULL) { - if (*out_typetup != NULL) { - PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'signature' and 'dtype'"); - goto fail; - } - *out_typetup = Py_BuildValue("(N)", dtype); - } - bad_arg = 0; - } - break; - case 'e': - /* - * Overrides the global parameters buffer size, - * error mask, and error object - */ - if (strcmp(str, "extobj") == 0) { - Py_INCREF(value); - *out_extobj = value; - bad_arg = 0; - } - break; - case 'k': - if (out_keepdims != NULL && strcmp(str, "keepdims") == 0) { - if (!PyBool_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "'keepdims' must be a boolean"); - goto fail; - } - *out_keepdims = (value == Py_True); - bad_arg = 0; + if (PyTuple_CheckExact(out_kwd)) { + if (PyTuple_GET_SIZE(out_kwd) != nout) { + PyErr_SetString(PyExc_ValueError, + "The 'out' tuple must have exactly " + "one entry per ufunc output"); + goto fail; + } + /* 'out' must be a tuple of arrays and Nones */ + for(i = 0; i < nout; ++i) { + PyObject *val = PyTuple_GET_ITEM(out_kwd, i); + if (_set_out_array(val, out_op+nin+i) < 0) { + goto fail; } - break; - case 'o': - /* - * Output arrays may be specified as a keyword argument, - * either as a single array or None for single output - * ufuncs, or as a tuple of arrays and Nones. 
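Sketch of the tuple form required for multi-output ufuncs (`np.modf` has two
outputs; passing a bare array here is the deprecated branch above):

    >>> import numpy as np
    >>> frac, whole = np.empty(3), np.empty(3)
    >>> r = np.modf(np.arange(3.) / 2, out=(frac, whole))   # one entry per output
    >>> frac
    array([0. , 0.5, 0. ])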
- */ - if (strcmp(str, "out") == 0) { - if (nargs > nin) { - PyErr_SetString(PyExc_ValueError, - "cannot specify 'out' as both a " - "positional and keyword argument"); - goto fail; - } - if (PyTuple_CheckExact(value)) { - if (PyTuple_GET_SIZE(value) != nout) { - PyErr_SetString(PyExc_ValueError, - "The 'out' tuple must have exactly " - "one entry per ufunc output"); - goto fail; - } - /* 'out' must be a tuple of arrays and Nones */ - for(i = 0; i < nout; ++i) { - PyObject *val = PyTuple_GET_ITEM(value, i); - if (_set_out_array(val, out_op+nin+i) < 0) { - goto fail; - } - } - } - else if (nout == 1) { - /* Can be an array if it only has one output */ - if (_set_out_array(value, out_op + nin) < 0) { - goto fail; - } - } - else { - /* - * If the deprecated behavior is ever removed, - * keep only the else branch of this if-else - */ - if (PyArray_Check(value) || value == Py_None) { - if (DEPRECATE("passing a single array to the " - "'out' keyword argument of a " - "ufunc with\n" - "more than one output will " - "result in an error in the " - "future") < 0) { - /* The future error message */ - PyErr_SetString(PyExc_TypeError, + } + } + else if (nout == 1) { + /* Can be an array if it only has one output */ + if (_set_out_array(out_kwd, out_op + nin) < 0) { + goto fail; + } + } + else { + /* + * If the deprecated behavior is ever removed, + * keep only the else branch of this if-else + */ + if (PyArray_Check(out_kwd) || out_kwd == Py_None) { + if (DEPRECATE("passing a single array to the " + "'out' keyword argument of a " + "ufunc with\n" + "more than one output will " + "result in an error in the " + "future") < 0) { + /* The future error message */ + PyErr_SetString(PyExc_TypeError, "'out' must be a tuple of arrays"); - goto fail; - } - if (_set_out_array(value, out_op+nin) < 0) { - goto fail; - } - } - else { - PyErr_SetString(PyExc_TypeError, - nout > 1 ? "'out' must be a tuple " - "of arrays" : - "'out' must be an array or a " - "tuple of a single array"); - goto fail; - } - } - bad_arg = 0; + goto fail; } - /* Allows the default output layout to be overridden */ - else if (strcmp(str, "order") == 0) { - if (!PyArray_OrderConverter(value, out_order)) { - goto fail; - } - bad_arg = 0; + if (_set_out_array(out_kwd, out_op+nin) < 0) { + goto fail; } - break; - case 's': - /* Allows a specific function inner loop to be selected */ - if (strcmp(str, "sig") == 0 || - strcmp(str, "signature") == 0) { - if (has_sig == 1) { - PyErr_SetString(PyExc_ValueError, + } + else { + PyErr_SetString(PyExc_TypeError, + nout > 1 ? "'out' must be a tuple " + "of arrays" : + "'out' must be an array or a " + "tuple of a single array"); + goto fail; + } + } + } + /* + * Check we did not get both axis and axes, or multiple ways + * to define a signature. 
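The mutual-exclusion checks that follow can be exercised from Python; a minimal sketch (the exception type mirrors the C code below)::

    import numpy as np

    np.add(1, 2, dtype=np.float64)     # pick the inner loop via dtype...
    np.add(1, 2, signature='dd->d')    # ...or via an explicit signature
    try:
        np.add(1, 2, dtype=np.float64, signature='dd->d')
    except RuntimeError:
        pass  # "cannot specify both 'signature' and 'dtype'"
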
+ */ + if (out_axes != NULL && out_axis != NULL && + *out_axes != NULL && *out_axis != NULL) { + PyErr_SetString(PyExc_TypeError, + "cannot specify both 'axis' and 'axes'"); + goto fail; + } + if (sig) { /* borrowed reference */ + if (*out_typetup != NULL) { + PyErr_SetString(PyExc_ValueError, "cannot specify both 'sig' and 'signature'"); - goto fail; - } - if (*out_typetup != NULL) { - PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'signature' and 'dtype'"); - goto fail; - } - Py_INCREF(value); - *out_typetup = value; - bad_arg = 0; - has_sig = 1; - } - else if (strcmp(str, "subok") == 0) { - if (!PyBool_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "'subok' must be a boolean"); - goto fail; - } - *out_subok = (value == Py_True); - bad_arg = 0; - } - break; - case 'w': - /* - * Provides a boolean array 'where=' mask if - * out_wheremask is supplied. - */ - if (out_wheremask != NULL && strcmp(str, "where") == 0) { - PyArray_Descr *dtype; - dtype = PyArray_DescrFromType(NPY_BOOL); - if (dtype == NULL) { - goto fail; - } - if (value == Py_True) { - /* - * Optimization: where=True is the same as no - * where argument. This lets us document it as a - * default argument - */ - bad_arg = 0; - break; - } - *out_wheremask = (PyArrayObject *)PyArray_FromAny( - value, dtype, - 0, 0, 0, NULL); - if (*out_wheremask == NULL) { - goto fail; - } - bad_arg = 0; - } - break; + goto fail; } - - if (bad_arg) { - char *format = "'%s' is an invalid keyword to ufunc '%s'"; - PyErr_Format(PyExc_TypeError, format, str, ufunc_name); + Py_INCREF(sig); + *out_typetup = sig; + } + if (dtype) { /* new reference */ + if (*out_typetup != NULL) { + PyErr_SetString(PyExc_RuntimeError, + "cannot specify both 'signature' and 'dtype'"); goto fail; } + /* Note: "N" uses the reference */ + *out_typetup = Py_BuildValue("(N)", dtype); } } - Py_XDECREF(str_key_obj); - return 0; fail: - Py_XDECREF(str_key_obj); + Py_XDECREF(dtype); Py_XDECREF(*out_typetup); Py_XDECREF(*out_extobj); if (out_wheremask != NULL) { @@ -1281,7 +1328,6 @@ iterator_loop(PyUFuncObject *ufunc, PyArrayObject **op_it; npy_uint32 iter_flags; - int retval; NPY_BEGIN_THREADS_DEF; @@ -1355,7 +1401,6 @@ iterator_loop(PyUFuncObject *ufunc, /* Call the __array_prepare__ functions for the new array */ if (prepare_ufunc_output(ufunc, &op[nin+i], arr_prep[i], full_args, i) < 0) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return -1; } @@ -1384,7 +1429,6 @@ iterator_loop(PyUFuncObject *ufunc, baseptrs[i] = PyArray_BYTES(op_it[i]); } if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return -1; } @@ -1392,7 +1436,6 @@ iterator_loop(PyUFuncObject *ufunc, /* Get the variables needed for the loop */ iternext = NpyIter_GetIterNext(iter, NULL); if (iternext == NULL) { - NpyIter_Close(iter); NpyIter_Deallocate(iter); return -1; } @@ -1410,9 +1453,7 @@ iterator_loop(PyUFuncObject *ufunc, NPY_END_THREADS; } - retval = NpyIter_Close(iter); - NpyIter_Deallocate(iter); - return retval; + return NpyIter_Deallocate(iter); } /* @@ -1597,7 +1638,7 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, PyObject **arr_prep, ufunc_full_args full_args) { - int retval, i, nin = ufunc->nin, nout = ufunc->nout; + int i, nin = ufunc->nin, nout = ufunc->nout; int nop = nin + nout; npy_uint32 op_flags[NPY_MAXARGS]; NpyIter *iter; @@ -1709,7 +1750,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, if (prepare_ufunc_output(ufunc, &op_tmp, arr_prep[i], full_args, i) < 0) { - NpyIter_Close(iter); 
NpyIter_Deallocate(iter);
 return -1;
 }
@@ -1720,7 +1760,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
 "The __array_prepare__ functions modified the data "
 "pointer addresses in an invalid fashion");
 Py_DECREF(op_tmp);
- NpyIter_Close(iter);
 NpyIter_Deallocate(iter);
 return -1;
 }
@@ -1755,7 +1794,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
 wheremask != NULL ? fixed_strides[nop]
 : fixed_strides[nop + nin],
 &innerloop, &innerloopdata, &needs_api) < 0) {
- NpyIter_Close(iter);
 NpyIter_Deallocate(iter);
 return -1;
 }
@@ -1763,7 +1801,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
 /* Get the variables needed for the loop */
 iternext = NpyIter_GetIterNext(iter, NULL);
 if (iternext == NULL) {
- NpyIter_Close(iter);
 NpyIter_Deallocate(iter);
 return -1;
 }
@@ -1787,9 +1824,7 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
 NPY_AUXDATA_FREE(innerloopdata);
 }

- retval = NpyIter_Close(iter);
- NpyIter_Deallocate(iter);
- return retval;
+ return NpyIter_Deallocate(iter);
 }

 static npy_bool
@@ -2300,7 +2335,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
 int nin, nout;
 int i, j, idim, nop;
 const char *ufunc_name;
- int retval = 0, subok = 1;
+ int retval, subok = 1;
 int needs_api = 0;

 PyArray_Descr *dtypes[NPY_MAXARGS];
@@ -2809,16 +2844,11 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
 goto fail;
 }

- /* Write back any temporary data from PyArray_SetWritebackIfCopyBase */
- if (NpyIter_Close(iter) < 0) {
- goto fail;
- }
-
 PyArray_free(inner_strides);
- if (NpyIter_Close(iter) < 0) {
- goto fail;
+ if (NpyIter_Deallocate(iter) < 0) {
+ retval = -1;
 }
- NpyIter_Deallocate(iter);
+
 /* The caller takes ownership of all the references in op */
 for (i = 0; i < nop; ++i) {
 Py_XDECREF(dtypes[i]);
@@ -2831,14 +2861,13 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
 Py_XDECREF(full_args.in);
 Py_XDECREF(full_args.out);

- NPY_UF_DBG_PRINT("Returning Success\n");
+ NPY_UF_DBG_PRINT1("Returning code %d\n", retval);

- return 0;
+ return retval;

 fail:
 NPY_UF_DBG_PRINT1("Returning failure code %d\n", retval);
 PyArray_free(inner_strides);
- NpyIter_Close(iter);
 NpyIter_Deallocate(iter);
 for (i = 0; i < nop; ++i) {
 Py_XDECREF(op[i]);
@@ -3031,7 +3060,7 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc,
 Py_XDECREF(full_args.out);
 Py_XDECREF(wheremask);

- NPY_UF_DBG_PRINT("Returning Success\n");
+ NPY_UF_DBG_PRINT("Returning success code 0\n");

 return 0;

@@ -3722,12 +3751,6 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
 }

 finish:
- if (NpyIter_Close(iter) < 0) {
- goto fail;
- }
- if (NpyIter_Close(iter_inner) < 0) {
- goto fail;
- }
 Py_XDECREF(op_dtypes[0]);
 NpyIter_Deallocate(iter);
 NpyIter_Deallocate(iter_inner);
@@ -4110,9 +4133,6 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
 }

 finish:
- if (NpyIter_Close(iter) < 0) {
- goto fail;
- }
 Py_XDECREF(op_dtypes[0]);
 NpyIter_Deallocate(iter);

@@ -4141,7 +4161,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
 int axes[NPY_MAXDIMS];
 PyObject *axes_in = NULL;
 PyArrayObject *mp = NULL, *ret = NULL;
- PyObject *op, *res = NULL;
+ PyObject *op;
 PyObject *obj_ind, *context;
 PyArrayObject *indices = NULL;
 PyArray_Descr *otype = NULL;
@@ -4387,25 +4407,31 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
 return NULL;
 }

- /* If an output parameter was provided, don't wrap it */
- if (out != NULL) {
- return (PyObject *)ret;
- }
-
- if (Py_TYPE(op) != Py_TYPE(ret)) {
- res = PyObject_CallMethod(op, "__array_wrap__", "O", ret);
- if (res == NULL) {
-
PyErr_Clear(); - } - else if (res == Py_None) { - Py_DECREF(res); + /* Wrap and return the output */ + { + /* Find __array_wrap__ - note that these rules are different to the + * normal ufunc path + */ + PyObject *wrap; + if (out != NULL) { + wrap = Py_None; + Py_INCREF(wrap); + } + else if (Py_TYPE(op) != Py_TYPE(ret)) { + wrap = PyObject_GetAttr(op, npy_um_str_array_wrap); + if (wrap == NULL) { + PyErr_Clear(); + } + else if (!PyCallable_Check(wrap)) { + Py_DECREF(wrap); + wrap = NULL; + } } else { - Py_DECREF(ret); - return res; + wrap = NULL; } + return _apply_array_wrap(wrap, ret, NULL); } - return PyArray_Return(ret); fail: Py_XDECREF(otype); @@ -4413,78 +4439,6 @@ fail: return NULL; } -/* - * This function analyzes the input arguments - * and determines an appropriate __array_wrap__ function to call - * for the outputs. - * - * If an output argument is provided, then it is wrapped - * with its own __array_wrap__ not with the one determined by - * the input arguments. - * - * if the provided output argument is already an array, - * the wrapping function is None (which means no wrapping will - * be done --- not even PyArray_Return). - * - * A NULL is placed in output_wrap for outputs that - * should just have PyArray_Return called. - */ -static void -_find_array_wrap(ufunc_full_args args, PyObject *kwds, - PyObject **output_wrap, int nin, int nout) -{ - int i; - PyObject *obj; - PyObject *wrap = NULL; - - /* - * If a 'subok' parameter is passed and isn't True, don't wrap but put None - * into slots with out arguments which means return the out argument - */ - if (kwds != NULL && (obj = PyDict_GetItem(kwds, - npy_um_str_subok)) != NULL) { - if (obj != Py_True) { - /* skip search for wrap members */ - goto handle_out; - } - } - - /* - * Determine the wrapping function given by the input arrays - * (could be NULL). - */ - wrap = _find_array_method(args.in, npy_um_str_array_wrap); - - /* - * For all the output arrays decide what to do. - * - * 1) Use the wrap function determined from the input arrays - * This is the default if the output array is not - * passed in. - * - * 2) Use the __array_wrap__ method of the output object - * passed in. -- this is special cased for - * exact ndarray so that no PyArray_Return is - * done in that case. - */ -handle_out: - if (args.out == NULL) { - for (i = 0; i < nout; i++) { - Py_XINCREF(wrap); - output_wrap[i] = wrap; - } - } - else { - for (i = 0; i < nout; i++) { - output_wrap[i] = _get_output_array_method( - PyTuple_GET_ITEM(args.out, i), npy_um_str_array_wrap, wrap); - } - } - - Py_XDECREF(wrap); - return; -} - static PyObject * ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) @@ -4507,22 +4461,7 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) errval = PyUFunc_GenericFunction(ufunc, args, kwds, mps); if (errval < 0) { - if (errval == -1) { - return NULL; - } - else if (ufunc->nin == 2 && ufunc->nout == 1) { - /* - * For array_richcompare's benefit -- see the long comment in - * get_ufunc_arguments. 
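Routing reductions through ``_apply_array_wrap`` means ndarray subclasses keep their type through ``reduce``; a sketch (``MyArray`` is a hypothetical subclass used only for illustration)::

    import numpy as np

    class MyArray(np.ndarray):
        def __array_wrap__(self, arr, context=None):
            # results are handed back here to be re-wrapped
            return np.ndarray.__array_wrap__(self, arr, context)

    m = np.arange(6.).reshape(2, 3).view(MyArray)
    assert isinstance(np.add(m, 1), MyArray)              # element-wise call
    assert isinstance(np.add.reduce(m, axis=0), MyArray)  # reduction wraps too
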
- */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
- else {
- PyErr_SetString(PyExc_TypeError,
- "XX can't happen, please report a bug XX");
- return NULL;
- }
+ return NULL;
 }

 /* Free the input references */
@@ -4555,42 +4494,20 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 /* wrap outputs */
 for (i = 0; i < ufunc->nout; i++) {
 int j = ufunc->nin+i;
- PyObject *wrap = wraparr[i];
-
- if (wrap == NULL) {
- /* default behavior */
- retobj[i] = PyArray_Return(mps[j]);
- }
- else if (wrap == Py_None) {
- Py_DECREF(wrap);
- retobj[i] = (PyObject *)mps[j];
- }
- else {
- PyObject *res;
- PyObject *args_tup;
+ _ufunc_context context;
+ PyObject *wrapped;

- /* Call the method with appropriate context */
- args_tup = _get_wrap_prepare_args(full_args);
- if (args_tup == NULL) {
- goto fail;
- }
- res = PyObject_CallFunction(
- wrap, "O(OOi)", mps[j], ufunc, args_tup, i);
- Py_DECREF(args_tup);
+ context.ufunc = ufunc;
+ context.args = full_args;
+ context.out_i = i;

- /* Handle __array_wrap__ that does not accept a context argument */
- if (res == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
- PyErr_Clear();
- res = PyObject_CallFunctionObjArgs(wrap, mps[j], NULL);
- }
- Py_DECREF(wrap);
- Py_DECREF(mps[j]);
- mps[j] = NULL; /* Prevent fail double-freeing this */
- if (res == NULL) {
- goto fail;
- }
- retobj[i] = res;
+ wrapped = _apply_array_wrap(wraparr[i], mps[j], &context);
+ mps[j] = NULL; /* Prevent fail double-freeing this */
+ if (wrapped == NULL) {
+ goto fail;
 }
+
+ retobj[i] = wrapped;
 }

 Py_XDECREF(full_args.in);
@@ -5544,7 +5461,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 iternext = NpyIter_GetIterNext(iter_buffer, NULL);
 if (iternext == NULL) {
- NpyIter_Close(iter_buffer);
 NpyIter_Deallocate(iter_buffer);
 goto fail;
 }
@@ -5614,7 +5530,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 PyErr_SetString(PyExc_ValueError, err_msg);
 }

- NpyIter_Close(iter_buffer);
 NpyIter_Deallocate(iter_buffer);

 Py_XDECREF(op2_array);
@@ -5632,7 +5547,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 }

 fail:
- /* iter_buffer has already been deallocated, don't use NpyIter_Close */
+ /* iter_buffer has already been deallocated, don't use NpyIter_Deallocate */
 if (op1_array != (PyArrayObject*)op1) {
 PyArray_DiscardWritebackIfCopy(op1_array);
 }
diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h
index d6fd3837a..5438270f1 100644
--- a/numpy/core/src/umath/ufunc_object.h
+++ b/numpy/core/src/umath/ufunc_object.h
@@ -10,13 +10,23 @@ ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
 NPY_NO_EXPORT const char*
 ufunc_get_name_cstr(PyUFuncObject *ufunc);

-/* interned strings (on umath import) */
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_out;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_subok;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_prepare;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_wrap;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_finalize;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_ufunc;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_pyvals_name;
+/* strings from umathmodule.c that are interned on umath import */
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_out;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_where;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axes;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axis;
+NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_keepdims;
+NPY_VISIBILITY_HIDDEN extern
PyObject *npy_um_str_casting; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_order; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_dtype; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_subok; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_signature; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_sig; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_extobj; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_wrap; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_finalize; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_ufunc; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_pyvals_name; #endif diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 5567b9bbf..9291a5138 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -226,20 +226,40 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args) ***************************************************************************** */ -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_out = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_subok = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_prepare = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_wrap = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_finalize = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_ufunc = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_pyvals_name = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_out = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_where = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axes = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axis = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_keepdims = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_casting = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_order = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_dtype = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_subok = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_signature = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_sig = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_extobj = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_prepare = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_wrap = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_finalize = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_ufunc = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL; /* intern some strings used in ufuncs */ static int intern_strings(void) { npy_um_str_out = PyUString_InternFromString("out"); + npy_um_str_where = PyUString_InternFromString("where"); + npy_um_str_axes = PyUString_InternFromString("axes"); + npy_um_str_axis = PyUString_InternFromString("axis"); + npy_um_str_keepdims = PyUString_InternFromString("keepdims"); + npy_um_str_casting = PyUString_InternFromString("casting"); + npy_um_str_order = PyUString_InternFromString("order"); + npy_um_str_dtype = PyUString_InternFromString("dtype"); npy_um_str_subok = PyUString_InternFromString("subok"); + npy_um_str_signature = PyUString_InternFromString("signature"); + npy_um_str_sig = PyUString_InternFromString("sig"); + npy_um_str_extobj = PyUString_InternFromString("extobj"); npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__"); npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__"); npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__"); diff --git 
a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py index 60a7c72f7..5d66d963f 100644 --- a/numpy/core/tests/test_deprecations.py +++ b/numpy/core/tests/test_deprecations.py @@ -36,7 +36,7 @@ class _DeprecationTestCase(object): # Do *not* ignore other DeprecationWarnings. Ignoring warnings # can give very confusing results because of - # http://bugs.python.org/issue4180 and it is probably simplest to + # https://bugs.python.org/issue4180 and it is probably simplest to # try to keep the tests cleanly giving only the right warning type. # (While checking them set to "error" those are ignored anyway) # We still have them show up, because otherwise they would be raised @@ -190,10 +190,10 @@ class TestComparisonDeprecations(_DeprecationTestCase): b = np.array(['a', 'b', 'c']) assert_raises(ValueError, lambda x, y: x == y, a, b) - # The empty list is not cast to string, as this is only to document - # that fact (it likely should be changed). This means that the - # following works (and returns False) due to dtype mismatch: - a == [] + # The empty list is not cast to string, and this used to pass due + # to dtype mismatch; now (2018-06-21) it correctly leads to a + # FutureWarning. + assert_warns(FutureWarning, lambda: a == []) def test_void_dtype_equality_failures(self): class NotArray(object): @@ -414,7 +414,7 @@ class TestClassicIntDivision(_DeprecationTestCase): """ See #7949. Deprecate the numeric-style dtypes with -3 flag in python 2 if used for division - List of data types: http://docs.scipy.org/doc/numpy/user/basics.types.html + List of data types: https://docs.scipy.org/doc/numpy/user/basics.types.html """ def test_int_dtypes(self): #scramble types and do some mix and match testing @@ -504,3 +504,17 @@ class TestGeneratorSum(_DeprecationTestCase): # 2018-02-25, 1.15.0 def test_generator_sum(self): self.assert_deprecated(np.sum, args=((i for i in range(5)),)) + + +class TestSctypeNA(_VisibleDeprecationTestCase): + # 2018-06-24, 1.16 + def test_sctypeNA(self): + self.assert_deprecated(lambda: np.sctypeNA['?']) + self.assert_deprecated(lambda: np.typeNA['?']) + self.assert_deprecated(lambda: np.typeNA.get('?')) + + +class TestPositiveOnNonNumerical(_DeprecationTestCase): + # 2018-06-28, 1.16.0 + def test_positive_on_non_number(self): + self.assert_deprecated(operator.pos, args=(np.array('foo'),)) diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py index 27fbb10d5..31ef9d609 100644 --- a/numpy/core/tests/test_dtype.py +++ b/numpy/core/tests/test_dtype.py @@ -4,6 +4,7 @@ import pickle import sys import operator import pytest +import ctypes import numpy as np from numpy.core._rational_tests import rational @@ -728,3 +729,74 @@ def test_dtypes_are_true(): def test_invalid_dtype_string(): # test for gh-10440 assert_raises(TypeError, np.dtype, 'f8,i8,[f8,i8]') + + +class TestFromCTypes(object): + + @staticmethod + def check(ctype, dtype): + dtype = np.dtype(dtype) + assert_equal(np.dtype(ctype), dtype) + assert_equal(np.dtype(ctype()), dtype) + + def test_array(self): + c8 = ctypes.c_uint8 + self.check( 3 * c8, (np.uint8, (3,))) + self.check( 1 * c8, (np.uint8, (1,))) + self.check( 0 * c8, (np.uint8, (0,))) + self.check(1 * (3 * c8), ((np.uint8, (3,)), (1,))) + self.check(3 * (1 * c8), ((np.uint8, (1,)), (3,))) + + def test_padded_structure(self): + class PaddedStruct(ctypes.Structure): + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16) + ] + expected = np.dtype([ + ('a', np.uint8), + ('b', np.uint16) + ], align=True) + 
self.check(PaddedStruct, expected) + + @pytest.mark.xfail(reason="_pack_ is ignored - see gh-11651") + def test_packed_structure(self): + class PackedStructure(ctypes.Structure): + _pack_ = 1 + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16) + ] + expected = np.dtype([ + ('a', np.uint8), + ('b', np.uint16) + ]) + self.check(PackedStructure, expected) + + @pytest.mark.xfail(sys.byteorder != 'little', + reason="non-native endianness does not work - see gh-10533") + def test_little_endian_structure(self): + class PaddedStruct(ctypes.LittleEndianStructure): + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16) + ] + expected = np.dtype([ + ('a', '<B'), + ('b', '<H') + ], align=True) + self.check(PaddedStruct, expected) + + @pytest.mark.xfail(sys.byteorder != 'big', + reason="non-native endianness does not work - see gh-10533") + def test_big_endian_structure(self): + class PaddedStruct(ctypes.BigEndianStructure): + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16) + ] + expected = np.dtype([ + ('a', '>B'), + ('b', '>H') + ], align=True) + self.check(PaddedStruct, expected) diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py index 63e75ff7a..8ce374a75 100644 --- a/numpy/core/tests/test_einsum.py +++ b/numpy/core/tests/test_einsum.py @@ -16,7 +16,7 @@ for size, char in zip(sizes, chars): global_size_dict[char] = size -class TestEinSum(object): +class TestEinsum(object): def test_einsum_errors(self): for do_opt in [True, False]: # Need enough arguments @@ -614,7 +614,7 @@ class TestEinSum(object): np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False) assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False)) assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False)) - + def test_einsum_broadcast(self): # Issue #2455 change in handling ellipsis # remove the 'middle broadcast' error @@ -730,19 +730,27 @@ class TestEinSum(object): res = np.einsum('...ij,...jk->...ik', a, a, out=out) assert_equal(res, tgt) - def optimize_compare(self, string): + def test_out_is_res(self): + a = np.arange(9).reshape(3, 3) + res = np.einsum('...ij,...jk->...ik', a, a, out=a) + assert res is a + + def optimize_compare(self, subscripts, operands=None): # Tests all paths of the optimization function against # conventional einsum - operands = [string] - terms = string.split('->')[0].split(',') - for term in terms: - dims = [global_size_dict[x] for x in term] - operands.append(np.random.rand(*dims)) - - noopt = np.einsum(*operands, optimize=False) - opt = np.einsum(*operands, optimize='greedy') + if operands is None: + args = [subscripts] + terms = subscripts.split('->')[0].split(',') + for term in terms: + dims = [global_size_dict[x] for x in term] + args.append(np.random.rand(*dims)) + else: + args = [subscripts] + operands + + noopt = np.einsum(*args, optimize=False) + opt = np.einsum(*args, optimize='greedy') assert_almost_equal(opt, noopt) - opt = np.einsum(*operands, optimize='optimal') + opt = np.einsum(*args, optimize='optimal') assert_almost_equal(opt, noopt) def test_hadamard_like_products(self): @@ -828,8 +836,28 @@ class TestEinSum(object): b = np.einsum('bbcdc->d', a) assert_equal(b, [12]) + def test_broadcasting_dot_cases(self): + # Ensures broadcasting cases are not mistaken for GEMM -class TestEinSumPath(object): + a = np.random.rand(1, 5, 4) + b = np.random.rand(4, 6) + c = np.random.rand(5, 6) + d = np.random.rand(10) + + self.optimize_compare('ijk,kl,jl', operands=[a, 
b, c]) + self.optimize_compare('ijk,kl,jl,i->i', operands=[a, b, c, d]) + + e = np.random.rand(1, 1, 5, 4) + f = np.random.rand(7, 7) + self.optimize_compare('abjk,kl,jl', operands=[e, b, c]) + self.optimize_compare('abjk,kl,jl,ab->ab', operands=[e, b, c, f]) + + # Edge case found in gh-11308 + g = np.arange(64).reshape(2, 4, 8) + self.optimize_compare('obk,ijk->ioj', operands=[g, g]) + + +class TestEinsumPath(object): def build_operands(self, string, size_dict=global_size_dict): # Builds views based off initial operands @@ -875,7 +903,7 @@ class TestEinSumPath(object): long_test1 = self.build_operands('acdf,jbje,gihb,hfac,gfac,gifabc,hfac') path, path_str = np.einsum_path(*long_test1, optimize='greedy') self.assert_path_equal(path, ['einsum_path', - (1, 4), (2, 4), (1, 4), (1, 3), (1, 2), (0, 1)]) + (3, 6), (3, 4), (2, 4), (2, 3), (0, 2), (0, 1)]) path, path_str = np.einsum_path(*long_test1, optimize='optimal') self.assert_path_equal(path, ['einsum_path', @@ -884,10 +912,12 @@ class TestEinSumPath(object): # Long test 2 long_test2 = self.build_operands('chd,bde,agbc,hiad,bdi,cgh,agdb') path, path_str = np.einsum_path(*long_test2, optimize='greedy') + print(path) self.assert_path_equal(path, ['einsum_path', (3, 4), (0, 3), (3, 4), (1, 3), (1, 2), (0, 1)]) path, path_str = np.einsum_path(*long_test2, optimize='optimal') + print(path) self.assert_path_equal(path, ['einsum_path', (0, 5), (1, 4), (3, 4), (1, 3), (1, 2), (0, 1)]) @@ -921,7 +951,7 @@ class TestEinSumPath(object): # Edge test4 edge_test4 = self.build_operands('dcc,fce,ea,dbf->ab') path, path_str = np.einsum_path(*edge_test4, optimize='greedy') - self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)]) + self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)]) path, path_str = np.einsum_path(*edge_test4, optimize='optimal') self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)]) @@ -944,7 +974,7 @@ class TestEinSumPath(object): self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)]) path, path_str = np.einsum_path(*path_test, optimize=True) - self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)]) + self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)]) exp_path = ['einsum_path', (0, 2), (0, 2), (0, 1)] path, path_str = np.einsum_path(*path_test, optimize=exp_path) @@ -961,3 +991,14 @@ class TestEinSumPath(object): for sp in itertools.product(['', ' '], repeat=4): # no error for any spacing np.einsum('{}...a{}->{}...a{}'.format(*sp), arr) + +def test_overlap(): + a = np.arange(9, dtype=int).reshape(3, 3) + b = np.arange(9, dtype=int).reshape(3, 3) + d = np.dot(a, b) + # sanity check + c = np.einsum('ij,jk->ik', a, b) + assert_equal(c, d) + #gh-10080, out overlaps one of the operands + c = np.einsum('ij,jk->ik', a, b, out=b) + assert_equal(c, d) diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py index 88f5deabc..276cd9f93 100644 --- a/numpy/core/tests/test_indexing.py +++ b/numpy/core/tests/test_indexing.py @@ -329,6 +329,21 @@ class TestIndexing(object): assert_raises(IndexError, a.__getitem__, ind) assert_raises(IndexError, a.__setitem__, ind, 0) + def test_trivial_fancy_not_possible(self): + # Test that the fast path for trivial assignment is not incorrectly + # used when the index is not contiguous or 1D, see also gh-11467. 
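Before moving on from the einsum changes above: the paths being asserted come from ``np.einsum_path``, whose result can be fed back to ``einsum`` to skip re-optimization; a sketch with small random operands::

    import numpy as np

    a = np.random.rand(4, 5)
    b = np.random.rand(5, 6)
    c = np.random.rand(6, 7)

    # path is e.g. ['einsum_path', (0, 1), (0, 1)]: pairwise contractions
    path, desc = np.einsum_path('ij,jk,kl->il', a, b, c, optimize='greedy')
    res = np.einsum('ij,jk,kl->il', a, b, c, optimize=path)
    assert np.allclose(res, a.dot(b).dot(c))
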
+ a = np.arange(6)
+ idx = np.arange(6, dtype=np.intp).reshape(2, 1, 3)[:, :, 0]
+ assert_array_equal(a[idx], idx)
+
+ # this case must not go into the fast path, note that idx is
+ # a non-contiguous, non-1D array here.
+ a[idx] = -1
+ res = np.arange(6)
+ res[0] = -1
+ res[3] = -1
+ assert_array_equal(a, res)
+
 def test_nonbaseclass_values(self):
 class SubClass(np.ndarray):
 def __array_finalize__(self, old):
@@ -833,7 +848,10 @@ class TestMultiIndexingAutomated(object):
 # is not safe. It rejects np.array([1., 2.]) but not
 # [1., 2.] as index (same for ie. np.take).
 # (Note the importance of empty lists if changing this here)
- indx = np.array(indx, dtype=np.intp)
+ try:
+ indx = np.array(indx, dtype=np.intp)
+ except ValueError:
+ raise IndexError
 in_indices[i] = indx
 elif indx.dtype.kind != 'b' and indx.dtype.kind != 'i':
 raise IndexError('arrays used as indices must be of '
@@ -986,9 +1004,13 @@ class TestMultiIndexingAutomated(object):
 # Maybe never happens...
 raise ValueError
 arr = arr.take(mi.ravel(), axis=ax)
- arr = arr.reshape((arr.shape[:ax]
- + mi.shape
- + arr.shape[ax+1:]))
+ try:
+ arr = arr.reshape((arr.shape[:ax]
+ + mi.shape
+ + arr.shape[ax+1:]))
+ except ValueError:
+ # too many dimensions, probably
+ raise IndexError
 ax += mi.ndim
 continue
@@ -1014,8 +1036,8 @@ class TestMultiIndexingAutomated(object):
 except Exception as e:
 if HAS_REFCOUNT:
 prev_refcount = sys.getrefcount(arr)
- assert_raises(Exception, arr.__getitem__, index)
- assert_raises(Exception, arr.__setitem__, index, 0)
+ assert_raises(type(e), arr.__getitem__, index)
+ assert_raises(type(e), arr.__setitem__, index, 0)
 if HAS_REFCOUNT:
 assert_equal(prev_refcount, sys.getrefcount(arr))
 return
@@ -1038,8 +1060,8 @@ class TestMultiIndexingAutomated(object):
 except Exception as e:
 if HAS_REFCOUNT:
 prev_refcount = sys.getrefcount(arr)
- assert_raises(Exception, arr.__getitem__, index)
- assert_raises(Exception, arr.__setitem__, index, 0)
+ assert_raises(type(e), arr.__getitem__, index)
+ assert_raises(type(e), arr.__setitem__, index, 0)
 if HAS_REFCOUNT:
 assert_equal(prev_refcount, sys.getrefcount(arr))
 return
@@ -1127,10 +1149,8 @@ class TestMultiIndexingAutomated(object):

 def test_1d(self):
 a = np.arange(10)
- with warnings.catch_warnings():
- warnings.filterwarnings('error', '', np.VisibleDeprecationWarning)
- for index in self.complex_indices:
- self._check_single_index(a, index)
+ for index in self.complex_indices:
+ self._check_single_index(a, index)

 class TestFloatNonIntegerArgument(object):
 """
diff --git a/numpy/core/tests/test_memmap.py b/numpy/core/tests/test_memmap.py
index 6c88a9c2c..59ca28324 100644
--- a/numpy/core/tests/test_memmap.py
+++ b/numpy/core/tests/test_memmap.py
@@ -196,3 +196,8 @@ class TestMemmap(object):
 offset = mmap.ALLOCATIONGRANULARITY + 1
 fp = memmap(self.tmpfp, shape=size, mode='w+', offset=offset)
 assert_(fp.offset == offset)
+
+ def test_no_shape(self):
+ self.tmpfp.write(b'a'*16)
+ mm = memmap(self.tmpfp, dtype='float64')
+ assert_equal(mm.shape, (2,))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b13e48fdc..1511f5b6b 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -44,7 +44,7 @@ from datetime import timedelta, datetime
 if sys.version_info[:2] > (3, 2):
 # In Python 3.3 the representation of empty shape, strides and sub-offsets
 # is an empty tuple instead of None.
- # http://docs.python.org/dev/whatsnew/3.3.html#api-changes + # https://docs.python.org/dev/whatsnew/3.3.html#api-changes EMPTY = () else: EMPTY = None @@ -688,6 +688,9 @@ class TestScalarIndexing(object): class TestCreation(object): + """ + Test the np.array constructor + """ def test_from_attribute(self): class x(object): def __array__(self, dtype=None): @@ -903,6 +906,34 @@ class TestCreation(object): assert_raises(ValueError, np.ndarray, buffer=buf, strides=(0,), shape=(max_bytes//itemsize + 1,), dtype=dtype) + def test_jagged_ndim_object(self): + # Lists of mismatching depths are treated as object arrays + a = np.array([[1], 2, 3]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + + a = np.array([1, [2], 3]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + + a = np.array([1, 2, [3]]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + + def test_jagged_shape_object(self): + # The jagged dimension of a list is turned into an object array + a = np.array([[1, 1], [2], [3]]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + + a = np.array([[1], [2, 2], [3]]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + + a = np.array([[1], [2], [3, 3]]) + assert_equal(a.shape, (3,)) + assert_equal(a.dtype, object) + class TestStructured(object): def test_subarray_field_access(self): @@ -3383,6 +3414,16 @@ class TestBinop(object): assert_equal(obj_arr ** -1, pow_for(-1, obj_arr)) assert_equal(obj_arr ** 2, pow_for(2, obj_arr)) + def test_pos_array_ufunc_override(self): + class A(np.ndarray): + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + return getattr(ufunc, method)(*[i.view(np.ndarray) for + i in inputs], **kwargs) + tst = np.array('foo').view(A) + with assert_raises(TypeError): + +tst + + class TestTemporaryElide(object): # elision is only triggered on relatively large arrays @@ -4796,9 +4837,25 @@ class TestRecord(object): fn2 = func('f2') b[fn2] = 3 - assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3)) - assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2)) - assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,))) + # In 1.16 code below can be replaced by: + # assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3)) + # assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2)) + # assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,))) + with suppress_warnings() as sup: + sup.filter(FutureWarning, + ".* selecting multiple fields .*") + + assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3)) + assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2)) + assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,))) + # view of subfield view/copy + assert_equal(b[['f1', 'f2']][0].view(('i4', 2)).tolist(), + (2, 3)) + assert_equal(b[['f2', 'f1']][0].view(('i4', 2)).tolist(), + (3, 2)) + view_dtype = [('f1', 'i4'), ('f3', [('', 'i4')])] + assert_equal(b[['f1', 'f3']][0].view(view_dtype).tolist(), + (2, (1,))) # non-ascii unicode field indexing is well behaved if not is_py3: @@ -4808,6 +4865,51 @@ class TestRecord(object): assert_raises(ValueError, a.__setitem__, u'\u03e0', 1) assert_raises(ValueError, a.__getitem__, u'\u03e0') + # can be removed in 1.16 + def test_field_names_deprecation(self): + + def collect_warnings(f, *args, **kwargs): + with warnings.catch_warnings(record=True) as log: + warnings.simplefilter("always") + f(*args, **kwargs) + return [w.category for w in log] + + a = np.zeros((1,), dtype=[('f1', 'i4'), + ('f2', 'i4'), + ('f3', [('sf1', 'i4')])]) + a['f1'][0] = 1 + a['f2'][0] = 2 + a['f3'][0] = (3,) + b = np.zeros((1,), dtype=[('f1', 'i4'), + 
('f2', 'i4'),
 ('f3', [('sf1', 'i4')])])
+ b['f1'][0] = 1
+ b['f2'][0] = 2
+ b['f3'][0] = (3,)
+
+ # All the different functions raise a warning, but not an error
+ assert_equal(collect_warnings(a[['f1', 'f2']].__setitem__, 0, (10, 20)),
+ [FutureWarning])
+ # For <=1.12 a is not modified, but it will be in 1.13
+ assert_equal(a, b)
+
+ # Views also warn
+ subset = a[['f1', 'f2']]
+ subset_view = subset.view()
+ assert_equal(collect_warnings(subset_view['f1'].__setitem__, 0, 10),
+ [FutureWarning])
+ # But the write goes through:
+ assert_equal(subset['f1'][0], 10)
+ # Only one warning per multiple field indexing, though (even if there
+ # are multiple views involved):
+ assert_equal(collect_warnings(subset['f1'].__setitem__, 0, 10), [])
+
+ # make sure views of a multi-field index warn too
+ c = np.zeros(3, dtype='i8,i8,i8')
+ assert_equal(collect_warnings(c[['f0', 'f2']].view, 'i8,i8'),
+ [FutureWarning])
+
+
 def test_record_hash(self):
 a = np.array([(1, 2), (1, 2)], dtype='i1,i2')
 a.flags.writeable = False
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index a0096efdb..13bc6b34a 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -2830,10 +2830,6 @@ def test_writebacks():
 x[:] = 123
 # x.data still valid
 assert_equal(au, 6) # but not connected to au
- do_close = 1
- # test like above, only in C, and with an option to skip the NpyIter_Close
- _multiarray_tests.test_nditer_writeback(3, do_close, au, op_dtypes=[np.dtype('f4')])
- assert_equal(au, 3)
 it = nditer(au, [], [['readwrite', 'updateifcopy']],
 casting='equiv', op_dtypes=[np.dtype('f4')])
@@ -2862,7 +2858,7 @@ def test_writebacks():
 x[...] = 123
 # make sure we cannot reenter the closed iterator
 enter = it.__enter__
- assert_raises(ValueError, enter)
+ assert_raises(RuntimeError, enter)

 def test_close_equivalent():
 ''' using a context manager and using nditer.close are equivalent
@@ -2897,12 +2893,13 @@ def test_close_raises():
 assert_raises(StopIteration, next, it)
 assert_raises(ValueError, getattr, it, 'operands')

+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
 def test_warn_noclose():
 a = np.arange(6, dtype='f4')
 au = a.byteswap().newbyteorder()
- do_close = 0
 with suppress_warnings() as sup:
 sup.record(RuntimeWarning)
- # test like above, only in C, and with an option to skip the NpyIter_Close
- _multiarray_tests.test_nditer_writeback(3, do_close, au, op_dtypes=[np.dtype('f4')])
+ it = np.nditer(au, [], [['readwrite', 'updateifcopy']],
+ casting='equiv', op_dtypes=[np.dtype('f4')])
+ del it
 assert len(sup.log) == 1
diff --git a/numpy/core/tests/test_numerictypes.py b/numpy/core/tests/test_numerictypes.py
index cdf1b0490..4c3cc6c9e 100644
--- a/numpy/core/tests/test_numerictypes.py
+++ b/numpy/core/tests/test_numerictypes.py
@@ -406,3 +406,9 @@ class TestIsSubDType(object):
 for w1, w2 in itertools.product(self.wrappers, repeat=2):
 assert_(not np.issubdtype(w1(np.float32), w2(np.float64)))
 assert_(not np.issubdtype(w1(np.float64), w2(np.float32)))
+
+
+class TestSctypeDict(object):
+ def test_longdouble(self):
+ assert_(np.sctypeDict['f8'] is not np.longdouble)
+ assert_(np.sctypeDict['c16'] is not np.clongdouble)
diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
index 9aeb93f74..d7c7d16e3 100644
--- a/numpy/core/tests/test_records.py
+++ b/numpy/core/tests/test_records.py
@@ -11,6 +11,7 @@ import pickle
 import warnings
 import textwrap
 from os import path
+import pytest

 import numpy as np
 from numpy.testing import
( @@ -360,6 +361,7 @@ class TestRecord(object): with assert_raises(ValueError): r.setfield([2,3], *r.dtype.fields['f']) + @pytest.mark.xfail(reason="See gh-10411, becomes real error in 1.16") def test_out_of_order_fields(self): # names in the same order, padding added to descr x = self.data[['col1', 'col2']] diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index f8f75d9ea..5f4410d54 100644 --- a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -46,9 +46,11 @@ class TestRegression(object): assert_array_equal(a, b) def test_typeNA(self): - # Ticket #31 - assert_equal(np.typeNA[np.int64], 'Int64') - assert_equal(np.typeNA[np.uint64], 'UInt64') + # Issue gh-515 + with suppress_warnings() as sup: + sup.filter(np.VisibleDeprecationWarning) + assert_equal(np.typeNA[np.int64], 'Int64') + assert_equal(np.typeNA[np.uint64], 'UInt64') def test_dtype_names(self): # Ticket #35 @@ -2375,3 +2377,31 @@ class TestRegression(object): structure = np.array([1], dtype=[(('x', 'X'), np.object_)]) structure[0]['x'] = np.array([2]) gc.collect() + + def test_dtype_scalar_squeeze(self): + # gh-11384 + values = { + 'S': b"a", + 'M': "2018-06-20", + } + for ch in np.typecodes['All']: + if ch in 'O': + continue + sctype = np.dtype(ch).type + scvalue = sctype(values.get(ch, 3)) + for axis in [None, ()]: + squeezed = scvalue.squeeze(axis=axis) + assert_equal(squeezed, scvalue) + assert_equal(type(squeezed), type(scvalue)) + + def test_field_access_by_title(self): + # gh-11507 + s = 'Some long field name' + if HAS_REFCOUNT: + base = sys.getrefcount(s) + t = np.dtype([((s, 'f1'), np.float64)]) + data = np.zeros(10, t) + for i in range(10): + v = str(data[['f1']]) + if HAS_REFCOUNT: + assert_(base <= sys.getrefcount(s)) diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index ab2ef5ce6..a55f06b69 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -136,7 +136,7 @@ class TestPower(object): # 1 ** -1 possible special case base = [np.array(1, dt)[()] for dt in 'bhilqBHILQ'] for i1, i2 in itertools.product(base, exp): - if i1.dtype.name != 'uint64': + if i1.dtype != np.uint64: assert_raises(ValueError, operator.pow, i1, i2) else: res = operator.pow(i1, i2) @@ -146,7 +146,7 @@ class TestPower(object): # -1 ** -1 possible special case base = [np.array(-1, dt)[()] for dt in 'bhilq'] for i1, i2 in itertools.product(base, exp): - if i1.dtype.name != 'uint64': + if i1.dtype != np.uint64: assert_raises(ValueError, operator.pow, i1, i2) else: res = operator.pow(i1, i2) @@ -156,7 +156,7 @@ class TestPower(object): # 2 ** -1 perhaps generic base = [np.array(2, dt)[()] for dt in 'bhilqBHILQ'] for i1, i2 in itertools.product(base, exp): - if i1.dtype.name != 'uint64': + if i1.dtype != np.uint64: assert_raises(ValueError, operator.pow, i1, i2) else: res = operator.pow(i1, i2) @@ -519,7 +519,7 @@ class TestRepr(object): storage_bytes = np.dtype(t).itemsize*8 # could add some more types to the list below for which in ['small denorm', 'small norm']: - # Values from http://en.wikipedia.org/wiki/IEEE_754 + # Values from https://en.wikipedia.org/wiki/IEEE_754 constr = np.array([0x00]*storage_bytes, dtype=np.uint8) if which == 'small denorm': byte = last_fraction_bit_idx // 8 diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index ef9ced354..0e564e305 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -1643,6 +1643,16 @@ class 
TestUfunc(object):
 target = np.array([ True, False, False, False], dtype=bool)
 assert_equal(np.all(target == (mra == ra[0])), True)

+ def test_scalar_equal(self):
+ # Scalar comparisons should always work, without deprecation warnings,
+ # even when the ufunc fails.
+ a = np.array(0.)
+ b = np.array('a')
+ assert_(a != b)
+ assert_(b != a)
+ assert_(not (a == b))
+ assert_(not (b == a))
+
 def test_NotImplemented_not_returned(self):
 # See gh-5964 and gh-2091. Some of these functions are not operator
 # related and were fixed for other reasons in the past.
@@ -1652,17 +1662,16 @@ class TestUfunc(object):
 np.bitwise_xor, np.left_shift, np.right_shift, np.fmax,
 np.fmin, np.fmod, np.hypot, np.logaddexp, np.logaddexp2,
 np.logical_and, np.logical_or, np.logical_xor, np.maximum,
- np.minimum, np.mod
- ]
-
- # These functions still return NotImplemented. Will be fixed in
- # future.
- # bad = [np.greater, np.greater_equal, np.less, np.less_equal, np.not_equal]
+ np.minimum, np.mod,
+ np.greater, np.greater_equal, np.less, np.less_equal,
+ np.equal, np.not_equal]

 a = np.array('1')
 b = 1
+ c = np.array([1., 2.])
 for f in binary_funcs:
 assert_raises(TypeError, f, a, b)
+ assert_raises(TypeError, f, c, a)

 def test_reduce_noncontig_output(self):
 # Check that reduction deals with non-contiguous output arrays
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 4772913be..85c9a4929 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -14,7 +14,7 @@ from numpy.testing import (
 assert_, assert_equal, assert_raises, assert_raises_regex,
 assert_array_equal, assert_almost_equal, assert_array_almost_equal,
 assert_allclose, assert_no_warnings, suppress_warnings,
- _gen_alignment_data,
+ _gen_alignment_data, assert_warns
 )

@@ -1339,6 +1339,10 @@ class TestMinMax(object):
 assert_equal(np.min(r), np.nan)
 assert_equal(len(sup.log), n)

+ def test_minimize_warns(self):
+ # gh-11589
+ assert_warns(RuntimeWarning, np.minimum, np.nan, 1)
+

 class TestAbsoluteNegative(object):
 def test_abs_neg_blocked(self):
@@ -1568,13 +1572,14 @@ class TestSpecialMethods(object):

 class A(object):
 def __array__(self):
- return np.zeros(1)
+ return np.zeros(2)

 def __array_wrap__(self, arr, context):
 raise RuntimeError

 a = A()
 assert_raises(RuntimeError, ncu.maximum, a, a)
+ assert_raises(RuntimeError, ncu.maximum.reduce, a)

 def test_failing_out_wrap(self):

@@ -1745,18 +1750,22 @@ class TestSpecialMethods(object):
 return "B"

 class C(object):
+ def __init__(self):
+ self.count = 0
+
 def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ self.count += 1
 return NotImplemented

 class CSub(C):
 def __array_ufunc__(self, func, method, *inputs, **kwargs):
+ self.count += 1
 return NotImplemented

 a = A()
 a_sub = ASub()
 b = B()
 c = C()
- c_sub = CSub()

 # Standard
 res = np.multiply(a, a_sub)
@@ -1767,11 +1776,27 @@ class TestSpecialMethods(object):
 # With 1 NotImplemented
 res = np.multiply(c, a)
 assert_equal(res, "A")
+ assert_equal(c.count, 1)
+ # Check our counter works, so we can trust tests below.
+ res = np.multiply(c, a)
+ assert_equal(c.count, 2)

 # Both NotImplemented.
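The counting scheme set up here verifies that each ``__array_ufunc__`` returning ``NotImplemented`` is consulted exactly once; in isolation the rule looks like this (``Deferring`` is a hypothetical helper)::

    import numpy as np

    class Deferring(object):
        def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
            return NotImplemented   # always defer to the other operands

    try:
        np.multiply(Deferring(), Deferring())
    except TypeError:
        pass  # every operand deferred, so the call fails
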
+ c = C() + c_sub = CSub() assert_raises(TypeError, np.multiply, c, c_sub) + assert_equal(c.count, 1) + assert_equal(c_sub.count, 1) + c.count = c_sub.count = 0 assert_raises(TypeError, np.multiply, c_sub, c) + assert_equal(c.count, 1) + assert_equal(c_sub.count, 1) + c.count = 0 + assert_raises(TypeError, np.multiply, c, c) + assert_equal(c.count, 1) + c.count = 0 assert_raises(TypeError, np.multiply, 2, c) + assert_equal(c.count, 1) # Ternary testing. assert_equal(three_mul_ufunc(a, 1, 2), "A") @@ -1783,11 +1808,19 @@ class TestSpecialMethods(object): assert_equal(three_mul_ufunc(a, 2, b), "A") assert_equal(three_mul_ufunc(a, 2, a_sub), "ASub") assert_equal(three_mul_ufunc(a, a_sub, 3), "ASub") + c.count = 0 assert_equal(three_mul_ufunc(c, a_sub, 3), "ASub") + assert_equal(c.count, 1) + c.count = 0 assert_equal(three_mul_ufunc(1, a_sub, c), "ASub") + assert_equal(c.count, 0) + c.count = 0 assert_equal(three_mul_ufunc(a, b, c), "A") + assert_equal(c.count, 0) + c_sub.count = 0 assert_equal(three_mul_ufunc(a, b, c_sub), "A") + assert_equal(c_sub.count, 0) assert_equal(three_mul_ufunc(1, 2, b), "B") assert_raises(TypeError, three_mul_ufunc, 1, 2, c) @@ -1806,9 +1839,25 @@ class TestSpecialMethods(object): assert_equal(four_mul_ufunc(a_sub, 1, 2, a), "ASub") assert_equal(four_mul_ufunc(a, 1, 2, a_sub), "ASub") + c = C() + c_sub = CSub() assert_raises(TypeError, four_mul_ufunc, 1, 2, 3, c) + assert_equal(c.count, 1) + c.count = 0 assert_raises(TypeError, four_mul_ufunc, 1, 2, c_sub, c) - assert_raises(TypeError, four_mul_ufunc, 1, c, c_sub, c) + assert_equal(c_sub.count, 1) + assert_equal(c.count, 1) + c2 = C() + c.count = c_sub.count = 0 + assert_raises(TypeError, four_mul_ufunc, 1, c, c_sub, c2) + assert_equal(c_sub.count, 1) + assert_equal(c.count, 1) + assert_equal(c2.count, 0) + c.count = c2.count = c_sub.count = 0 + assert_raises(TypeError, four_mul_ufunc, c2, c, c_sub, c) + assert_equal(c_sub.count, 1) + assert_equal(c.count, 0) + assert_equal(c2.count, 1) def test_ufunc_override_methods(self): diff --git a/numpy/ctypeslib.py b/numpy/ctypeslib.py index 9d71adbdb..329c7a280 100644 --- a/numpy/ctypeslib.py +++ b/numpy/ctypeslib.py @@ -12,7 +12,7 @@ as_array : Create an ndarray from a ctypes array. References ---------- -.. [1] "SciPy Cookbook: ctypes", http://www.scipy.org/Cookbook/Ctypes +.. [1] "SciPy Cookbook: ctypes", https://scipy-cookbook.readthedocs.io/items/Ctypes.html Examples -------- diff --git a/numpy/distutils/__init__.py b/numpy/distutils/__init__.py index b794bebd7..8dd326920 100644 --- a/numpy/distutils/__init__.py +++ b/numpy/distutils/__init__.py @@ -17,7 +17,7 @@ try: # Normally numpy is installed if the above import works, but an interrupted # in-place build could also have left a __config__.py. In that case the # next import may still fail, so keep it inside the try block. 
- from numpy.testing._private.pytesttester import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester except ImportError: diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py index c926e7378..3bd8057b4 100644 --- a/numpy/distutils/fcompiler/__init__.py +++ b/numpy/distutils/fcompiler/__init__.py @@ -35,10 +35,11 @@ from numpy.distutils.ccompiler import CCompiler, gen_lib_options from numpy.distutils import log from numpy.distutils.misc_util import is_string, all_strings, is_sequence, \ make_temp_file, get_shared_lib_extension -from numpy.distutils.environment import EnvironmentConfig from numpy.distutils.exec_command import find_executable from numpy.distutils.compat import get_exception +from .environment import EnvironmentConfig + __metaclass__ = type class CompilerNotFound(Exception): @@ -91,7 +92,7 @@ class FCompiler(CCompiler): # These are the environment variables and distutils keys used. # Each configuration description is - # (<hook name>, <environment variable>, <key in distutils.cfg>, <convert>) + # (<hook name>, <environment variable>, <key in distutils.cfg>, <convert>, <append>) # The hook names are handled by the self._environment_hook method. # - names starting with 'self.' call methods in this class # - names starting with 'exe.' return the key in the executables dict @@ -101,43 +102,43 @@ class FCompiler(CCompiler): distutils_vars = EnvironmentConfig( distutils_section='config_fc', - noopt = (None, None, 'noopt', str2bool), - noarch = (None, None, 'noarch', str2bool), - debug = (None, None, 'debug', str2bool), - verbose = (None, None, 'verbose', str2bool), + noopt = (None, None, 'noopt', str2bool, False), + noarch = (None, None, 'noarch', str2bool, False), + debug = (None, None, 'debug', str2bool, False), + verbose = (None, None, 'verbose', str2bool, False), ) command_vars = EnvironmentConfig( distutils_section='config_fc', - compiler_f77 = ('exe.compiler_f77', 'F77', 'f77exec', None), - compiler_f90 = ('exe.compiler_f90', 'F90', 'f90exec', None), - compiler_fix = ('exe.compiler_fix', 'F90', 'f90exec', None), - version_cmd = ('exe.version_cmd', None, None, None), - linker_so = ('exe.linker_so', 'LDSHARED', 'ldshared', None), - linker_exe = ('exe.linker_exe', 'LD', 'ld', None), - archiver = (None, 'AR', 'ar', None), - ranlib = (None, 'RANLIB', 'ranlib', None), + compiler_f77 = ('exe.compiler_f77', 'F77', 'f77exec', None, False), + compiler_f90 = ('exe.compiler_f90', 'F90', 'f90exec', None, False), + compiler_fix = ('exe.compiler_fix', 'F90', 'f90exec', None, False), + version_cmd = ('exe.version_cmd', None, None, None, False), + linker_so = ('exe.linker_so', 'LDSHARED', 'ldshared', None, False), + linker_exe = ('exe.linker_exe', 'LD', 'ld', None, False), + archiver = (None, 'AR', 'ar', None, False), + ranlib = (None, 'RANLIB', 'ranlib', None, False), ) flag_vars = EnvironmentConfig( distutils_section='config_fc', - f77 = ('flags.f77', 'F77FLAGS', 'f77flags', flaglist), - f90 = ('flags.f90', 'F90FLAGS', 'f90flags', flaglist), - free = ('flags.free', 'FREEFLAGS', 'freeflags', flaglist), - fix = ('flags.fix', None, None, flaglist), - opt = ('flags.opt', 'FOPT', 'opt', flaglist), - opt_f77 = ('flags.opt_f77', None, None, flaglist), - opt_f90 = ('flags.opt_f90', None, None, flaglist), - arch = ('flags.arch', 'FARCH', 'arch', flaglist), - arch_f77 = ('flags.arch_f77', None, None, flaglist), - arch_f90 = ('flags.arch_f90', None, None, flaglist), - debug = ('flags.debug', 'FDEBUG', 'fdebug', 
flaglist), - debug_f77 = ('flags.debug_f77', None, None, flaglist), - debug_f90 = ('flags.debug_f90', None, None, flaglist), - flags = ('self.get_flags', 'FFLAGS', 'fflags', flaglist), - linker_so = ('flags.linker_so', 'LDFLAGS', 'ldflags', flaglist), - linker_exe = ('flags.linker_exe', 'LDFLAGS', 'ldflags', flaglist), - ar = ('flags.ar', 'ARFLAGS', 'arflags', flaglist), + f77 = ('flags.f77', 'F77FLAGS', 'f77flags', flaglist, True), + f90 = ('flags.f90', 'F90FLAGS', 'f90flags', flaglist, True), + free = ('flags.free', 'FREEFLAGS', 'freeflags', flaglist, True), + fix = ('flags.fix', None, None, flaglist, False), + opt = ('flags.opt', 'FOPT', 'opt', flaglist, True), + opt_f77 = ('flags.opt_f77', None, None, flaglist, False), + opt_f90 = ('flags.opt_f90', None, None, flaglist, False), + arch = ('flags.arch', 'FARCH', 'arch', flaglist, False), + arch_f77 = ('flags.arch_f77', None, None, flaglist, False), + arch_f90 = ('flags.arch_f90', None, None, flaglist, False), + debug = ('flags.debug', 'FDEBUG', 'fdebug', flaglist, True), + debug_f77 = ('flags.debug_f77', None, None, flaglist, False), + debug_f90 = ('flags.debug_f90', None, None, flaglist, False), + flags = ('self.get_flags', 'FFLAGS', 'fflags', flaglist, True), + linker_so = ('flags.linker_so', 'LDFLAGS', 'ldflags', flaglist, True), + linker_exe = ('flags.linker_exe', 'LDFLAGS', 'ldflags', flaglist, True), + ar = ('flags.ar', 'ARFLAGS', 'arflags', flaglist, True), ) language_map = {'.f': 'f77', diff --git a/numpy/distutils/environment.py b/numpy/distutils/fcompiler/environment.py index 3798e16f5..489784580 100644 --- a/numpy/distutils/environment.py +++ b/numpy/distutils/fcompiler/environment.py @@ -14,7 +14,7 @@ class EnvironmentConfig(object): def dump_variable(self, name): conf_desc = self._conf_keys[name] - hook, envvar, confvar, convert = conf_desc + hook, envvar, confvar, convert, append = conf_desc if not convert: convert = lambda x : x print('%s.%s:' % (self._distutils_section, name)) @@ -49,10 +49,15 @@ class EnvironmentConfig(object): return var def _get_var(self, name, conf_desc): - hook, envvar, confvar, convert = conf_desc + hook, envvar, confvar, convert, append = conf_desc var = self._hook_handler(name, hook) if envvar is not None: - var = os.environ.get(envvar, var) + envvar_contents = os.environ.get(envvar) + if envvar_contents is not None: + if var and append and os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1': + var = var + [envvar_contents] + else: + var = envvar_contents if confvar is not None and self._conf: var = self._conf.get(confvar, (None, var))[1] if convert is not None: diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py index 41f0b1f61..8305aeae5 100644 --- a/numpy/distutils/misc_util.py +++ b/numpy/distutils/misc_util.py @@ -257,7 +257,7 @@ def minrelpath(path): return os.sep.join(l) def sorted_glob(fileglob): - """sorts output of python glob for http://bugs.python.org/issue30461 + """sorts output of python glob for https://bugs.python.org/issue30461 to allow extensions to have reproducible build results""" return sorted(glob.glob(fileglob)) @@ -317,7 +317,7 @@ def make_temp_file(suffix='', prefix='', text=True): return fo, name # Hooks for colored terminal output. 
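The append logic in ``_get_var`` above reduces to the following pure-Python sketch (``get_flags`` is a hypothetical stand-in for an ``append=True`` entry such as ``FFLAGS``; conversion via ``flaglist`` happens afterwards)::

    import os

    def get_flags(default, envvar):
        # mirrors _get_var: the environment appends to the defaults only
        # when NPY_DISTUTILS_APPEND_FLAGS is enabled
        contents = os.environ.get(envvar)
        if contents is None:
            return default
        if default and os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
            return default + [contents]
        return contents

    os.environ['NPY_DISTUTILS_APPEND_FLAGS'] = '1'
    os.environ['FFLAGS'] = '-fno-common'
    assert get_flags(['-O3'], 'FFLAGS') == ['-O3', '-fno-common']
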
-# See also http://www.livinglogic.de/Python/ansistyle +# See also https://web.archive.org/web/20100314204946/http://www.livinglogic.de/Python/ansistyle def terminal_has_colors(): if sys.platform=='cygwin' and 'USE_COLOR' not in os.environ: # Avoid importing curses that causes illegal operation @@ -2300,19 +2300,9 @@ import sys extra_dll_dir = os.path.join(os.path.dirname(__file__), '.libs') -if os.path.isdir(extra_dll_dir) and sys.platform == 'win32': - try: - from ctypes import windll, c_wchar_p - _AddDllDirectory = windll.kernel32.AddDllDirectory - _AddDllDirectory.argtypes = [c_wchar_p] - # Needed to initialize AddDllDirectory modifications - windll.kernel32.SetDefaultDllDirectories(0x1000) - except AttributeError: - def _AddDllDirectory(dll_directory): - os.environ.setdefault('PATH', '') - os.environ['PATH'] += os.pathsep + dll_directory - - _AddDllDirectory(extra_dll_dir) +if sys.platform == 'win32' and os.path.isdir(extra_dll_dir): + os.environ.setdefault('PATH', '') + os.environ['PATH'] += os.pathsep + extra_dll_dir """) diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py index 65d7de316..a5693bdd5 100644 --- a/numpy/distutils/system_info.py +++ b/numpy/distutils/system_info.py @@ -487,7 +487,7 @@ class FFTWNotFoundError(NotFoundError): class DJBFFTNotFoundError(NotFoundError): """ - DJBFFT (http://cr.yp.to/djbfft.html) libraries not found. + DJBFFT (https://cr.yp.to/djbfft.html) libraries not found. Directories to search for the libraries can be specified in the numpy/distutils/site.cfg file (section [djbfft]) or by setting the DJBFFT environment variable.""" @@ -495,7 +495,7 @@ class DJBFFTNotFoundError(NotFoundError): class NumericNotFoundError(NotFoundError): """ - Numeric (http://www.numpy.org/) module not found. + Numeric (https://www.numpy.org/) module not found. Get it from above location, install it, and retry setup.py.""" @@ -505,7 +505,7 @@ class X11NotFoundError(NotFoundError): class UmfpackNotFoundError(NotFoundError): """ - UMFPACK sparse solver (http://www.cise.ufl.edu/research/sparse/umfpack/) + UMFPACK sparse solver (https://www.cise.ufl.edu/research/sparse/umfpack/) not found. 
Directories to search for the libraries can be specified in the numpy/distutils/site.cfg file (section [umfpack]) or by setting the UMFPACK environment variable.""" diff --git a/numpy/distutils/tests/test_fcompiler.py b/numpy/distutils/tests/test_fcompiler.py new file mode 100644 index 000000000..95e44b051 --- /dev/null +++ b/numpy/distutils/tests/test_fcompiler.py @@ -0,0 +1,44 @@ +from __future__ import division, absolute_import, print_function + +from numpy.testing import assert_ +import numpy.distutils.fcompiler + +customizable_flags = [ + ('f77', 'F77FLAGS'), + ('f90', 'F90FLAGS'), + ('free', 'FREEFLAGS'), + ('arch', 'FARCH'), + ('debug', 'FDEBUG'), + ('flags', 'FFLAGS'), + ('linker_so', 'LDFLAGS'), +] + + +def test_fcompiler_flags(monkeypatch): + monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '0') + fc = numpy.distutils.fcompiler.new_fcompiler(compiler='none') + flag_vars = fc.flag_vars.clone(lambda *args, **kwargs: None) + + for opt, envvar in customizable_flags: + new_flag = '-dummy-{}-flag'.format(opt) + prev_flags = getattr(flag_vars, opt) + + monkeypatch.setenv(envvar, new_flag) + new_flags = getattr(flag_vars, opt) + monkeypatch.delenv(envvar) + assert_(new_flags == [new_flag]) + + monkeypatch.setenv('NPY_DISTUTILS_APPEND_FLAGS', '1') + + for opt, envvar in customizable_flags: + new_flag = '-dummy-{}-flag'.format(opt) + prev_flags = getattr(flag_vars, opt) + + monkeypatch.setenv(envvar, new_flag) + new_flags = getattr(flag_vars, opt) + monkeypatch.delenv(envvar) + if prev_flags is None: + assert_(new_flags == [new_flag]) + else: + assert_(new_flags == prev_flags + [new_flag]) + diff --git a/numpy/doc/broadcasting.py b/numpy/doc/broadcasting.py index 717914cda..1dc4f60bf 100644 --- a/numpy/doc/broadcasting.py +++ b/numpy/doc/broadcasting.py @@ -171,7 +171,7 @@ Here the ``newaxis`` index operator inserts a new axis into ``a``, making it a two-dimensional ``4x1`` array. Combining the ``4x1`` array with ``b``, which has shape ``(3,)``, yields a ``4x3`` array. -See `this article <http://wiki.scipy.org/EricsBroadcastingDoc>`_ +See `this article <https://scipy.github.io/old-wiki/pages/EricsBroadcastingDoc>`_ for illustrations of broadcasting concepts. """ diff --git a/numpy/doc/glossary.py b/numpy/doc/glossary.py index 0e1df495b..a3b9423a8 100644 --- a/numpy/doc/glossary.py +++ b/numpy/doc/glossary.py @@ -69,7 +69,7 @@ Glossary micro-processors and used for transmission of data over network protocols. BLAS - `Basic Linear Algebra Subprograms <http://en.wikipedia.org/wiki/BLAS>`_ + `Basic Linear Algebra Subprograms <https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms>`_ broadcast NumPy can do operations on arrays whose shapes are mismatched:: @@ -155,7 +155,7 @@ Glossary be used as keys. For more information on dictionaries, read the - `Python tutorial <http://docs.python.org/tut>`_. + `Python tutorial <https://docs.python.org/tutorial/>`_. field In a :term:`structured data type`, each sub-type is called a `field`. @@ -238,7 +238,7 @@ Glossary [3, 4]]) For more information, read the section on lists in the `Python - tutorial <http://docs.python.org/tut>`_. For a mapping + tutorial <https://docs.python.org/tutorial/>`_. For a mapping type (key-value), see *dictionary*. little-endian diff --git a/numpy/doc/indexing.py b/numpy/doc/indexing.py index 5f5033117..087a688bc 100644 --- a/numpy/doc/indexing.py +++ b/numpy/doc/indexing.py @@ -93,7 +93,7 @@ well. 
A few examples illustrate best: ::

 [21, 24, 27]])

 Note that slices of arrays do not copy the internal array data but
-also produce new views of the original data.
+only produce new views of the original data.

 It is possible to index arrays with other arrays for the purposes of
 selecting lists of values out of arrays into new arrays. There are
diff --git a/numpy/doc/misc.py b/numpy/doc/misc.py
index 24369871c..a76abe164 100644
--- a/numpy/doc/misc.py
+++ b/numpy/doc/misc.py
@@ -209,7 +209,7 @@ Only a survey of the choices. Little detail on how each works.
 Interfacing to Fortran:
 -----------------------
 The clear choice to wrap Fortran code is
-`f2py <http://docs.scipy.org/doc/numpy/f2py/>`_.
+`f2py <https://docs.scipy.org/doc/numpy/f2py/>`_.
 
 Pyfort is an older alternative, but not supported any longer.
 Fwrap is a newer project that looked promising but isn't being developed any
diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
index ba667da59..ab97c5df6 100644
--- a/numpy/doc/structured_arrays.py
+++ b/numpy/doc/structured_arrays.py
@@ -133,10 +133,9 @@ summary they are:
 
    Offsets may be chosen such that the fields overlap, though this will mean
    that assigning to one field may clobber any overlapping field's data. As
-   an exception, fields of :class:`numpy.object` type .. (see
-   :ref:`object arrays <arrays.object>`) cannot overlap with other fields,
-   because of the risk of clobbering the internal object pointer and then
-   dereferencing it.
+   an exception, fields of :class:`numpy.object` type cannot overlap with
+   other fields, because of the risk of clobbering the internal object
+   pointer and then dereferencing it.
 
    The optional 'aligned' value can be set to ``True`` to make the automatic
    offset computation use aligned offsets (see :ref:`offsets-and-alignment`),
@@ -235,6 +234,11 @@ If the offsets of the fields and itemsize of a structured array satisfy the
 alignment conditions, the array will have the ``ALIGNED`` :ref:`flag
 <numpy.ndarray.flags>` set.
 
+A convenience function :func:`numpy.lib.recfunctions.repack_fields` converts an
+aligned dtype or array to a packed one and vice versa. It takes either a dtype
+or structured ndarray as an argument, and returns a copy with fields re-packed,
+with or without padding bytes.
+
 .. _titles:
 
 Field Titles
@@ -396,27 +400,61 @@ typically a non-structured array, except in the case of nested structures.
 Accessing Multiple Fields
 ```````````````````````````
 
-One can index a structured array with a multi-field index, where the index is a
-list of field names::
+One can index and assign to a structured array with a multi-field index, where
+the index is a list of field names.
+
+.. warning::
+    The behavior of multi-field indexes will change from Numpy 1.15 to Numpy
+    1.16.
 
-    >>> a = np.zeros(3, dtype=[('a', 'i8'), ('b', 'i4'), ('c', 'f8')])
+In Numpy 1.16, the result of indexing with a multi-field index will be a view
+into the original array, as follows::
+
+    >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
     >>> a[['a', 'c']]
-    array([(0, 0.0), (0, 0.0), (0, 0.0)],
-         dtype={'names':['a','c'], 'formats':['<i8','<f8'], 'offsets':[0,11], 'itemsize':19})
+    array([(0, 0.), (0, 0.), (0, 0.)],
+        dtype={'names':['a','c'], 'formats':['<i4','<f4'], 'offsets':[0,8], 'itemsize':12})
+
+Assignment to the view modifies the original array. The view's fields will be
+in the order they were indexed.
Note that unlike for single-field indexing, the
+view's dtype has the same itemsize as the original array, and has fields at the
+same offsets as in the original array, and unindexed fields are merely missing.
+
+In Numpy 1.15, indexing an array with a multi-field index returns a copy of
+the result above for 1.16, but with fields packed together in memory as if
+passed through :func:`numpy.lib.recfunctions.repack_fields`. This is the
+behavior since Numpy 1.7.
+
+.. warning::
+   The new behavior in Numpy 1.16 leads to extra "padding" bytes at the
+   location of unindexed fields. You will need to update any code which depends
+   on the data having a "packed" layout. For instance code such as::
+
+     >>> a[['a','c']].view('i8')  # will fail in Numpy 1.16
+     ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
+
+   will need to be changed. This code has raised a ``FutureWarning`` since
+   Numpy 1.12.
+
+   The following is a recommended fix, which will behave identically in Numpy
+   1.15 and Numpy 1.16::
+
+     >>> from numpy.lib.recfunctions import repack_fields
+     >>> repack_fields(a[['a','c']]).view('i8')  # supported 1.15 and 1.16
+     array([0, 0, 0])
+
+Assigning to an array with a multi-field index will behave the same in Numpy
+1.15 and Numpy 1.16. In both versions the assignment will modify the original
+array::
+
     >>> a[['a', 'c']] = (2, 3)
     >>> a
     array([(2, 0, 3.), (2, 0, 3.), (2, 0, 3.)],
          dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<f4')])
 
-The resulting array is a view into the original array, such that assignment to
-the view modifies the original array. The view's fields will be in the order
-they were indexed. Note that unlike for single-field indexing, the view's dtype
-has the same itemsize as the original array, and has fields at the same offsets
-as in the original array, and unindexed fields are merely missing.
-
-Since the view is a structured array itself, it obeys the assignment rules
-described above. For example, this means that one can swap the values of two
-fields using appropriate multi-field indexes::
+This obeys the structured array assignment rules described above. For example,
+this means that one can swap the values of two fields using appropriate
+multi-field indexes::
 
     >>> a[['a', 'c']] = a[['c', 'a']]
diff --git a/numpy/doc/subclassing.py b/numpy/doc/subclassing.py
index 3be3d94b3..4b983893a 100644
--- a/numpy/doc/subclassing.py
+++ b/numpy/doc/subclassing.py
@@ -108,7 +108,7 @@ A brief Python primer on ``__new__`` and ``__init__``
 ``__new__`` is a standard Python method, and, if present, is called
 before ``__init__`` when we create a class instance. See the `python
 __new__ documentation
-<http://docs.python.org/reference/datamodel.html#object.__new__>`_ for more detail.
+<https://docs.python.org/reference/datamodel.html#object.__new__>`_ for more detail.
 
 For example, consider the following Python code:
diff --git a/numpy/dual.py b/numpy/dual.py
index 8b91da262..3a16a8ec5 100644
--- a/numpy/dual.py
+++ b/numpy/dual.py
@@ -7,7 +7,7 @@ developers to transparently support these accelerated functions when
 scipy is available but still support users who have only installed
 NumPy.
 
-.. _Scipy : http://www.scipy.org
+..
_Scipy : https://www.scipy.org """ from __future__ import division, absolute_import, print_function diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py index 5075c682d..fbb64f762 100644 --- a/numpy/f2py/__init__.py +++ b/numpy/f2py/__init__.py @@ -69,6 +69,6 @@ def compile(source, f.close() return status -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py index 36e2222ea..23d36b2c2 100644 --- a/numpy/f2py/rules.py +++ b/numpy/f2py/rules.py @@ -1452,7 +1452,7 @@ def buildapi(rout): ['\\begin{description}'] + rd[k][1:] +\ ['\\end{description}'] - # Workaround for Python 2.6, 2.6.1 bug: http://bugs.python.org/issue4720 + # Workaround for Python 2.6, 2.6.1 bug: https://bugs.python.org/issue4720 if rd['keyformat'] or rd['xaformat']: argformat = rd['argformat'] if isinstance(argformat, list): diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py index bbb6ec8c7..44243b483 100644 --- a/numpy/fft/__init__.py +++ b/numpy/fft/__init__.py @@ -6,6 +6,6 @@ from .info import __doc__ from .fftpack import * from .helper import * -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py index d764cdc7e..dc40ac67b 100644 --- a/numpy/lib/__init__.py +++ b/numpy/lib/__init__.py @@ -46,6 +46,6 @@ __all__ += financial.__all__ __all__ += nanfunctions.__all__ __all__ += histograms.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index 4d3f35183..5880ea154 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -607,6 +607,14 @@ def isin(element, test_elements, assume_unique=False, invert=False): [ True, False]]) >>> element[mask] array([2, 4]) + + The indices of the matched values can be obtained with `nonzero`: + + >>> np.nonzero(mask) + (array([0, 1]), array([1, 0])) + + The test can also be inverted: + >>> mask = np.isin(element, test_elements, invert=True) >>> mask array([[ True, False], diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 23eac7e7d..ef5ec57e3 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -150,7 +150,7 @@ Notes ----- The ``.npy`` format, including motivation for creating it and a comparison of alternatives, is described in the `"npy-format" NEP -<http://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have +<https://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have evolved with time and this document is more current. 
""" diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 95edb95fa..75a39beaa 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -27,10 +27,11 @@ from numpy.core.fromnumeric import ( ravel, nonzero, sort, partition, mean, any, sum ) from numpy.core.numerictypes import typecodes, number +from numpy.core.function_base import add_newdoc from numpy.lib.twodim_base import diag from .utils import deprecate from numpy.core.multiarray import ( - _insert, add_docstring, digitize, bincount, normalize_axis_index, + _insert, add_docstring, bincount, normalize_axis_index, _monotonicity, interp as compiled_interp, interp_complex as compiled_interp_complex ) from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc @@ -1308,7 +1309,7 @@ def interp(x, xp, fp, left=None, right=None, period=None): return interp_func(x, xp, fp, left, right) -def angle(z, deg=0): +def angle(z, deg=False): """ Return the angle of the complex argument. @@ -1324,6 +1325,9 @@ def angle(z, deg=0): angle : ndarray or scalar The counterclockwise angle from the positive real axis on the complex plane, with dtype as numpy.float64. + + ..versionchanged:: 1.16.0 + This function works on subclasses of ndarray like `ma.array`. See Also -------- @@ -1338,18 +1342,18 @@ def angle(z, deg=0): 45.0 """ - if deg: - fact = 180/pi - else: - fact = 1.0 - z = asarray(z) - if (issubclass(z.dtype.type, _nx.complexfloating)): + z = asanyarray(z) + if issubclass(z.dtype.type, _nx.complexfloating): zimag = z.imag zreal = z.real else: zimag = 0 zreal = z - return arctan2(zimag, zreal) * fact + + a = arctan2(zimag, zreal) + if deg: + a *= 180/pi + return a def unwrap(p, discont=pi, axis=-1): @@ -1649,7 +1653,7 @@ def disp(mesg, device=None, linefeed=True): return -# See http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html +# See https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html _DIMENSION_NAME = r'\w+' _CORE_DIMENSION_LIST = '(?:{0:}(?:,{0:})*)?'.format(_DIMENSION_NAME) _ARGUMENT = r'\({}\)'.format(_CORE_DIMENSION_LIST) @@ -1906,7 +1910,7 @@ class vectorize(object): References ---------- .. [1] NumPy Reference, section `Generalized Universal Function API - <http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html>`_. + <https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html>`_. """ def __init__(self, pyfunc, otypes=None, doc=None, excluded=None, @@ -2561,7 +2565,7 @@ def bartlett(M): .. [3] A.V. Oppenheim and R.W. Schafer, "Discrete-Time Signal Processing", Prentice-Hall, 1999, pp. 468-471. .. [4] Wikipedia, "Window function", - http://en.wikipedia.org/wiki/Window_function + https://en.wikipedia.org/wiki/Window_function .. [5] W.H. Press, B.P. Flannery, S.A. Teukolsky, and W.T. Vetterling, "Numerical Recipes", Cambridge University Press, 1986, page 429. @@ -2661,7 +2665,7 @@ def hanning(M): .. [2] E.R. Kanasewich, "Time Sequence Analysis in Geophysics", The University of Alberta Press, 1975, pp. 106-108. .. [3] Wikipedia, "Window function", - http://en.wikipedia.org/wiki/Window_function + https://en.wikipedia.org/wiki/Window_function .. [4] W.H. Press, B.P. Flannery, S.A. Teukolsky, and W.T. Vetterling, "Numerical Recipes", Cambridge University Press, 1986, page 425. @@ -2759,7 +2763,7 @@ def hamming(M): .. [2] E.R. Kanasewich, "Time Sequence Analysis in Geophysics", The University of Alberta Press, 1975, pp. 109-110. .. 
[3] Wikipedia, "Window function",
-           http://en.wikipedia.org/wiki/Window_function
+           https://en.wikipedia.org/wiki/Window_function
     .. [4] W.H. Press,  B.P. Flannery, S.A. Teukolsky, and W.T. Vetterling,
           "Numerical Recipes", Cambridge University Press, 1986, page 425.
 
@@ -3036,7 +3040,7 @@ def kaiser(M, beta):
     .. [2] E.R. Kanasewich, "Time Sequence Analysis in Geophysics",
            The University of Alberta Press, 1975, pp. 177-178.
     .. [3] Wikipedia, "Window function",
-           http://en.wikipedia.org/wiki/Window_function
+           https://en.wikipedia.org/wiki/Window_function
 
     Examples
     --------
@@ -3124,7 +3128,7 @@ def sinc(x):
     .. [1] Weisstein, Eric W. "Sinc Function." From MathWorld--A Wolfram Web
            Resource. http://mathworld.wolfram.com/SincFunction.html
     .. [2] Wikipedia, "Sinc function",
-           http://en.wikipedia.org/wiki/Sinc_function
+           https://en.wikipedia.org/wiki/Sinc_function
 
     Examples
     --------
@@ -3398,9 +3402,9 @@ def _median(a, axis=None, out=None, overwrite_input=False):
 def percentile(a, q, axis=None, out=None,
                overwrite_input=False, interpolation='linear', keepdims=False):
     """
-    Compute the qth percentile of the data along the specified axis.
+    Compute the q-th percentile of the data along the specified axis.
 
-    Returns the qth percentile(s) of the array elements.
+    Returns the q-th percentile(s) of the array elements.
 
     Parameters
     ----------
@@ -3467,7 +3471,7 @@
     Notes
     -----
-    Given a vector ``V`` of length ``N``, the ``q``-th percentile of
+    Given a vector ``V`` of length ``N``, the q-th percentile of
     ``V`` is the value ``q/100`` of the way from the minimum to the
     maximum in a sorted copy of ``V``. The values and distances of
     the two nearest neighbors as well as the `interpolation` parameter
@@ -3543,7 +3547,7 @@
 def quantile(a, q, axis=None, out=None,
              overwrite_input=False, interpolation='linear', keepdims=False):
     """
-    Compute the `q`th quantile of the data along the specified axis.
+    Compute the q-th quantile of the data along the specified axis.
 
     .. versionadded:: 1.15.0
 
     Parameters
     ----------
@@ -3569,6 +3573,7 @@
         This optional parameter specifies the interpolation method to
         use when the desired quantile lies between two data points
         ``i < j``:
+
             * linear: ``i + (j - i) * fraction``, where ``fraction``
               is the fractional part of the index surrounded by ``i``
               and ``j``.
@@ -3602,7 +3607,7 @@
     Notes
     -----
-    Given a vector ``V`` of length ``N``, the ``q``-th quantile of
+    Given a vector ``V`` of length ``N``, the q-th quantile of
     ``V`` is the value ``q`` of the way from the minimum to the
     maximum in a sorted copy of ``V``.
The values and distances of the two nearest neighbors as well as the `interpolation` parameter @@ -3720,7 +3725,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, indices = concatenate((indices, [-1])) ap.partition(indices, axis=axis) - # ensure axis with qth is first + # ensure axis with q-th is first ap = np.moveaxis(ap, axis, 0) axis = 0 @@ -3753,7 +3758,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, ap.partition(concatenate((indices_below, indices_above)), axis=axis) - # ensure axis with qth is first + # ensure axis with q-th is first ap = np.moveaxis(ap, axis, 0) weights_below = np.moveaxis(weights_below, axis, 0) weights_above = np.moveaxis(weights_above, axis, 0) @@ -3767,7 +3772,7 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, x1 = take(ap, indices_below, axis=axis) * weights_below x2 = take(ap, indices_above, axis=axis) * weights_above - # ensure axis with qth is first + # ensure axis with q-th is first x1 = np.moveaxis(x1, axis, 0) x2 = np.moveaxis(x2, axis, 0) @@ -3840,10 +3845,10 @@ def trapz(y, x=None, dx=1.0, axis=-1): References ---------- - .. [1] Wikipedia page: http://en.wikipedia.org/wiki/Trapezoidal_rule + .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule .. [2] Illustration image: - http://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png + https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png Examples -------- @@ -3891,41 +3896,6 @@ def trapz(y, x=None, dx=1.0, axis=-1): return ret -#always succeed -def add_newdoc(place, obj, doc): - """ - Adds documentation to obj which is in module place. - - If doc is a string add it to obj as a docstring - - If doc is a tuple, then the first element is interpreted as - an attribute of obj and the second as the docstring - (method, docstring) - - If doc is a list, then each element of the list should be a - sequence of length two --> [(method1, docstring1), - (method2, docstring2), ...] - - This routine never raises an error. - - This routine cannot modify read-only docstrings, as appear - in new-style classes or built-in functions. Because this - routine never raises an error the caller must check manually - that the docstrings were changed. - """ - try: - new = getattr(__import__(place, globals(), {}, [obj]), obj) - if isinstance(doc, str): - add_docstring(new, doc.strip()) - elif isinstance(doc, tuple): - add_docstring(getattr(new, doc[0]), doc[1].strip()) - elif isinstance(doc, list): - for val in doc: - add_docstring(getattr(new, val[0]), val[1].strip()) - except Exception: - pass - - # Based on scitools meshgrid def meshgrid(*xi, **kwargs): """ @@ -4022,11 +3992,13 @@ def meshgrid(*xi, **kwargs): `meshgrid` is very useful to evaluate functions on a grid. + >>> import matplotlib.pyplot as plt >>> x = np.arange(-5, 5, 0.1) >>> y = np.arange(-5, 5, 0.1) >>> xx, yy = np.meshgrid(x, y, sparse=True) >>> z = np.sin(xx**2 + yy**2) / (xx**2 + yy**2) >>> h = plt.contourf(x,y,z) + >>> plt.show() """ ndim = len(xi) @@ -4526,3 +4498,113 @@ def append(arr, values, axis=None): values = ravel(values) axis = arr.ndim-1 return concatenate((arr, values), axis=axis) + + +def digitize(x, bins, right=False): + """ + Return the indices of the bins to which each value in input array belongs. 
+
+    ========= ============= ============================
+    `right`   order of bins returned index `i` satisfies
+    ========= ============= ============================
+    ``False`` increasing    ``bins[i-1] <= x < bins[i]``
+    ``True``  increasing    ``bins[i-1] < x <= bins[i]``
+    ``False`` decreasing    ``bins[i-1] > x >= bins[i]``
+    ``True``  decreasing    ``bins[i-1] >= x > bins[i]``
+    ========= ============= ============================
+
+    If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
+    returned as appropriate.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array to be binned. Prior to NumPy 1.10.0, this array had to
+        be 1-dimensional, but can now have any shape.
+    bins : array_like
+        Array of bins. It has to be 1-dimensional and monotonic.
+    right : bool, optional
+        Indicating whether the intervals include the right or the left bin
+        edge. Default behavior is (right==False) indicating that the interval
+        does not include the right edge. The left bin end is open in this
+        case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
+        monotonically increasing bins.
+
+    Returns
+    -------
+    indices : ndarray of ints
+        Output array of indices, of same shape as `x`.
+
+    Raises
+    ------
+    ValueError
+        If `bins` is not monotonic.
+    TypeError
+        If the type of the input is complex.
+
+    See Also
+    --------
+    bincount, histogram, unique, searchsorted
+
+    Notes
+    -----
+    If values in `x` are such that they fall outside the bin range,
+    attempting to index `bins` with the indices that `digitize` returns
+    will result in an IndexError.
+
+    .. versionadded:: 1.10.0
+
+    `np.digitize` is implemented in terms of `np.searchsorted`. This means
+    that a binary search is used to bin the values, which scales much better
+    for a larger number of bins than the previous linear search. It also
+    removes the requirement for the input array to be 1-dimensional.
+
+    For monotonically *increasing* `bins`, the following are equivalent::
+
+        np.digitize(x, bins, right=True)
+        np.searchsorted(bins, x, side='left')
+
+    Note that as the order of the arguments is reversed, the side must be too.
+    The `searchsorted` call is marginally faster, as it does not do any
+    monotonicity checks. Perhaps more importantly, it supports all dtypes.
+
+    Examples
+    --------
+    >>> x = np.array([0.2, 6.4, 3.0, 1.6])
+    >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
+    >>> inds = np.digitize(x, bins)
+    >>> inds
+    array([1, 4, 3, 2])
+    >>> for n in range(x.size):
+    ...   print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
+    ...
+ 0.0 <= 0.2 < 1.0 + 4.0 <= 6.4 < 10.0 + 2.5 <= 3.0 < 4.0 + 1.0 <= 1.6 < 2.5 + + >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.]) + >>> bins = np.array([0, 5, 10, 15, 20]) + >>> np.digitize(x,bins,right=True) + array([1, 2, 3, 4, 4]) + >>> np.digitize(x,bins,right=False) + array([1, 3, 3, 4, 5]) + """ + x = _nx.asarray(x) + bins = _nx.asarray(bins) + + # here for compatibility, searchsorted below is happy to take this + if np.issubdtype(x.dtype, _nx.complexfloating): + raise TypeError("x may not be complex") + + mono = _monotonicity(bins) + if mono == 0: + raise ValueError("bins must be monotonically increasing or decreasing") + + # this is backwards because the arguments below are swapped + side = 'left' if right else 'right' + if mono == -1: + # reverse the bins, and invert the results + return len(bins) - _nx.searchsorted(bins[::-1], x, side=side) + else: + return _nx.searchsorted(bins, x, side=side) diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index 2922b3a86..422b356f7 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -4,6 +4,7 @@ Histogram-related functions from __future__ import division, absolute_import, print_function import operator +import warnings import numpy as np from numpy.compat.py3k import basestring @@ -559,7 +560,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None): return bin_edges -def histogram(a, bins=10, range=None, normed=False, weights=None, +def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): r""" Compute the histogram of a set of data. @@ -571,8 +572,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, bins : int or sequence of scalars or str, optional If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` is a - sequence, it defines the bin edges, including the rightmost - edge, allowing for non-uniform bin widths. + sequence, it defines a monotonically increasing array of bin edges, + including the rightmost edge, allowing for non-uniform bin widths. .. versionadded:: 1.11.0 @@ -591,14 +592,12 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, .. deprecated:: 1.6.0 - This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy - behavior. It will be removed in NumPy 2.0.0. Use the ``density`` - keyword instead. If ``False``, the result will contain the - number of samples in each bin. If ``True``, the result is the - value of the probability *density* function at the bin, - normalized such that the *integral* over the range is 1. Note - that this latter behavior is known to be buggy with unequal bin - widths; use ``density`` instead. + This is equivalent to the `density` argument, but produces incorrect + results for unequal bin widths. It should not be used. + + .. versionchanged:: 1.15.0 + DeprecationWarnings are actually emitted. + weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count @@ -777,20 +776,44 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, # density overrides the normed keyword if density is not None: - normed = False + if normed is not None: + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "The normed argument is ignored when density is provided. 
" + "In future passing both will result in an error.", + DeprecationWarning, stacklevel=2) + normed = None if density: db = np.array(np.diff(bin_edges), float) return n/db/n.sum(), bin_edges elif normed: - # deprecated, buggy behavior. Remove for NumPy 2.0.0 + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "Passing `normed=True` on non-uniform bins has always been " + "broken, and computes neither the probability density " + "function nor the probability mass function. " + "The result is only correct if the bins are uniform, when " + "density=True will produce the same result anyway. " + "The argument will be removed in a future version of " + "numpy.", + np.VisibleDeprecationWarning, stacklevel=2) + + # this normalization is incorrect, but db = np.array(np.diff(bin_edges), float) return n/(n*db).sum(), bin_edges else: + if normed is not None: + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "Passing normed=False is deprecated, and has no effect. " + "Consider passing the density argument instead.", + DeprecationWarning, stacklevel=2) return n, bin_edges -def histogramdd(sample, bins=10, range=None, normed=False, weights=None): +def histogramdd(sample, bins=10, range=None, normed=None, weights=None, + density=None): """ Compute the multidimensional histogram of some data. @@ -811,7 +834,8 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): bins : sequence or int, optional The bin specification: - * A sequence of arrays describing the bin edges along each dimension. + * A sequence of arrays describing the monotonically increasing bin + edges along each dimension. * The number of bins for each dimension (nx, ny, ... =bins) * The number of bins for all dimensions (nx=ny=...=bins). @@ -822,9 +846,14 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): An entry of None in the sequence results in the minimum and maximum values being used for the corresponding dimension. The default, None, is equivalent to passing a tuple of D None values. + density : bool, optional + If False, the default, returns the number of samples in each bin. + If True, returns the probability *density* function at the bin, + ``bin_count / sample_count / bin_volume``. normed : bool, optional - If False, returns the number of samples in each bin. If True, - returns the bin density ``bin_count / sample_count / bin_volume``. + An alias for the density argument that behaves identically. To avoid + confusion with the broken normed argument to `histogram`, `density` + should be preferred. weights : (N,) array_like, optional An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. Weights are normalized to 1 if normed is True. 
If normed is False,
@@ -938,8 +967,18 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
     core = D*(slice(1, -1),)
     hist = hist[core]
 
-    # Normalize if normed is True
-    if normed:
+    # handle the aliasing normed argument
+    if normed is None:
+        if density is None:
+            density = False
+    elif density is None:
+        # an explicit normed argument was passed, alias it to the new name
+        density = normed
+    else:
+        raise TypeError("Cannot specify both 'normed' and 'density'")
+
+    if density:
+        # calculate the probability density function
         s = hist.sum()
         for i in _range(D):
             shape = np.ones(D, int)
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index d2139338e..009e6d229 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -121,39 +121,13 @@ class nd_grid(object):
     Notes
     -----
     Two instances of `nd_grid` are made available in the NumPy namespace,
-    `mgrid` and `ogrid`::
+    `mgrid` and `ogrid`, approximately defined as::
 
         mgrid = nd_grid(sparse=False)
         ogrid = nd_grid(sparse=True)
 
     Users should use these pre-defined instances instead of using `nd_grid`
     directly.
-
-    Examples
-    --------
-    >>> mgrid = np.lib.index_tricks.nd_grid()
-    >>> mgrid[0:5,0:5]
-    array([[[0, 0, 0, 0, 0],
-            [1, 1, 1, 1, 1],
-            [2, 2, 2, 2, 2],
-            [3, 3, 3, 3, 3],
-            [4, 4, 4, 4, 4]],
-           [[0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4],
-            [0, 1, 2, 3, 4]]])
-    >>> mgrid[-1:1:5j]
-    array([-1. , -0.5,  0. ,  0.5,  1. ])
-
-    >>> ogrid = np.lib.index_tricks.nd_grid(sparse=True)
-    >>> ogrid[0:5,0:5]
-    [array([[0],
-            [1],
-            [2],
-            [3],
-            [4]]), array([[0, 1, 2, 3, 4]])]
-
     """
 
     def __init__(self, sparse=False):
@@ -223,10 +197,97 @@ class nd_grid(object):
     def __len__(self):
         return 0
 
-mgrid = nd_grid(sparse=False)
-ogrid = nd_grid(sparse=True)
-mgrid.__doc__ = None  # set in numpy.add_newdocs
-ogrid.__doc__ = None  # set in numpy.add_newdocs
+
+class MGridClass(nd_grid):
+    """
+    `nd_grid` instance which returns a dense multi-dimensional "meshgrid".
+
+    An instance of `numpy.lib.index_tricks.nd_grid` which returns a dense
+    (or fleshed out) mesh-grid when indexed, so that each returned argument
+    has the same shape. The dimensions and number of the output arrays are
+    equal to the number of indexing dimensions. If the step length is not a
+    complex number, then the stop is not inclusive.
+
+    However, if the step length is a **complex number** (e.g. 5j), then
+    the integer part of its magnitude is interpreted as specifying the
+    number of points to create between the start and stop values, where
+    the stop value **is inclusive**.
+
+    Returns
+    -------
+    mesh-grid `ndarrays` all of the same dimensions
+
+    See Also
+    --------
+    numpy.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
+    ogrid : like mgrid but returns open (not fleshed out) mesh grids
+    r_ : array concatenator
+
+    Examples
+    --------
+    >>> np.mgrid[0:5,0:5]
+    array([[[0, 0, 0, 0, 0],
+            [1, 1, 1, 1, 1],
+            [2, 2, 2, 2, 2],
+            [3, 3, 3, 3, 3],
+            [4, 4, 4, 4, 4]],
+           [[0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3, 4]]])
+    >>> np.mgrid[-1:1:5j]
+    array([-1. , -0.5,  0. ,  0.5,  1. ])
+
+    """
+    def __init__(self):
+        super(MGridClass, self).__init__(sparse=False)
+
+mgrid = MGridClass()
+
+class OGridClass(nd_grid):
+    """
+    `nd_grid` instance which returns an open multi-dimensional "meshgrid".
+
+    An instance of `numpy.lib.index_tricks.nd_grid` which returns an open
+    (i.e.
not fleshed out) mesh-grid when indexed, so that only one dimension
+    of each returned array is greater than 1. The dimension and number of the
+    output arrays are equal to the number of indexing dimensions. If the step
+    length is not a complex number, then the stop is not inclusive.
+
+    However, if the step length is a **complex number** (e.g. 5j), then
+    the integer part of its magnitude is interpreted as specifying the
+    number of points to create between the start and stop values, where
+    the stop value **is inclusive**.
+
+    Returns
+    -------
+    mesh-grid `ndarrays` with only one dimension :math:`\\neq 1`
+
+    See Also
+    --------
+    np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
+    mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids
+    r_ : array concatenator
+
+    Examples
+    --------
+    >>> from numpy import ogrid
+    >>> ogrid[-1:1:5j]
+    array([-1. , -0.5,  0. ,  0.5,  1. ])
+    >>> ogrid[0:5,0:5]
+    [array([[0],
+            [1],
+            [2],
+            [3],
+            [4]]), array([[0, 1, 2, 3, 4]])]
+
+    """
+    def __init__(self):
+        super(OGridClass, self).__init__(sparse=True)
+
+ogrid = OGridClass()
+
 
 class AxisConcatenator(object):
     """
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 390927601..d8cfbf769 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -412,12 +412,13 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
         try:
             # Code to distinguish from NumPy binary files and pickles.
             _ZIP_PREFIX = b'PK\x03\x04'
+            _ZIP_SUFFIX = b'PK\x05\x06'  # empty zip files start with this
             N = len(format.MAGIC_PREFIX)
             magic = fid.read(N)
             # If the file size is less than N, we need to make sure not
             # to seek past the beginning of the file
             fid.seek(-min(N, len(magic)), 1)  # back-up
-            if magic.startswith(_ZIP_PREFIX):
+            if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX):
                 # zip-file (assume .npz)
                 # Transfer file ownership to NpzFile
                 tmp = own_fid
@@ -1259,8 +1260,8 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
     References
     ----------
     .. [1] `Format Specification Mini-Language
-           <http://docs.python.org/library/string.html#
-           format-specification-mini-language>`_, Python Documentation.
+           <https://docs.python.org/library/string.html#format-specification-mini-language>`_,
+           Python Documentation.
 
     Examples
     --------
@@ -1624,7 +1625,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     References
     ----------
     .. [1] NumPy User Guide, section `I/O with NumPy
-           <http://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.
+           <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.
 
     Examples
     --------
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 078608bbb..0e691f56e 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -494,9 +494,9 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False):
     References
     ----------
     .. [1] Wikipedia, "Curve fitting",
-           http://en.wikipedia.org/wiki/Curve_fitting
+           https://en.wikipedia.org/wiki/Curve_fitting
     ..
[2] Wikipedia, "Polynomial interpolation", - http://en.wikipedia.org/wiki/Polynomial_interpolation + https://en.wikipedia.org/wiki/Polynomial_interpolation Examples -------- diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py index c455bd93f..b6453d5a2 100644 --- a/numpy/lib/recfunctions.py +++ b/numpy/lib/recfunctions.py @@ -732,6 +732,84 @@ def rec_append_fields(base, names, data, dtypes=None): return append_fields(base, names, data=data, dtypes=dtypes, asrecarray=True, usemask=False) +def repack_fields(a, align=False, recurse=False): + """ + Re-pack the fields of a structured array or dtype in memory. + + The memory layout of structured datatypes allows fields at arbitrary + byte offsets. This means the fields can be separated by padding bytes, + their offsets can be non-monotonically increasing, and they can overlap. + + This method removes any overlaps and reorders the fields in memory so they + have increasing byte offsets, and adds or removes padding bytes depending + on the `align` option, which behaves like the `align` option to `np.dtype`. + + If `align=False`, this method produces a "packed" memory layout in which + each field starts at the byte the previous field ended, and any padding + bytes are removed. + + If `align=True`, this methods produces an "aligned" memory layout in which + each field's offset is a multiple of its alignment, and the total itemsize + is a multiple of the largest alignment, by adding padding bytes as needed. + + Parameters + ---------- + a : ndarray or dtype + array or dtype for which to repack the fields. + align : boolean + If true, use an "aligned" memory layout, otherwise use a "packed" layout. + recurse : boolean + If True, also repack nested structures. + + Returns + ------- + repacked : ndarray or dtype + Copy of `a` with fields repacked, or `a` itself if no repacking was + needed. + + Examples + -------- + + >>> def print_offsets(d): + ... print("offsets:", [d.fields[name][1] for name in d.names]) + ... print("itemsize:", d.itemsize) + ... 
+    >>> dt = np.dtype('u1,i4,f8', align=True)
+    >>> dt
+    dtype({'names':['f0','f1','f2'], 'formats':['u1','<i4','<f8'], 'offsets':[0,4,8], 'itemsize':16}, align=True)
+    >>> print_offsets(dt)
+    offsets: [0, 4, 8]
+    itemsize: 16
+    >>> packed_dt = repack_fields(dt)
+    >>> packed_dt
+    dtype([('f0', 'u1'), ('f1', '<i4'), ('f2', '<f8')])
+    >>> print_offsets(packed_dt)
+    offsets: [0, 1, 5]
+    itemsize: 13
+
+    """
+    if not isinstance(a, np.dtype):
+        dt = repack_fields(a.dtype, align=align, recurse=recurse)
+        return a.astype(dt, copy=False)
+
+    if a.names is None:
+        return a
+
+    fieldinfo = []
+    for name in a.names:
+        tup = a.fields[name]
+        if recurse:
+            fmt = repack_fields(tup[0], align=align, recurse=True)
+        else:
+            fmt = tup[0]
+
+        if len(tup) == 3:
+            name = (tup[2], name)
+
+        fieldinfo.append((name, fmt))
+
+    dt = np.dtype(fieldinfo, align=align)
+    return np.dtype((a.type, dt))
 
 def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                  autoconvert=False):
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 65104115a..d31d8a939 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -536,7 +536,11 @@ def expand_dims(a, axis):
     True
 
     """
-    a = asarray(a)
+    if isinstance(a, matrix):
+        a = asarray(a)
+    else:
+        a = asanyarray(a)
+
     shape = a.shape
     if axis > a.ndim or axis < -a.ndim - 1:
         # 2017-05-17, 1.13.0
diff --git a/numpy/lib/stride_tricks.py b/numpy/lib/stride_tricks.py
index 2abe5cdd1..bc5993802 100644
--- a/numpy/lib/stride_tricks.py
+++ b/numpy/lib/stride_tricks.py
@@ -219,23 +219,19 @@ def broadcast_arrays(*args, **kwargs):
     Examples
     --------
     >>> x = np.array([[1,2,3]])
-    >>> y = np.array([[1],[2],[3]])
+    >>> y = np.array([[4],[5]])
     >>> np.broadcast_arrays(x, y)
     [array([[1, 2, 3],
-           [1, 2, 3],
-           [1, 2, 3]]), array([[1, 1, 1],
-           [2, 2, 2],
-           [3, 3, 3]])]
+           [1, 2, 3]]), array([[4, 4, 4],
+           [5, 5, 5]])]
 
     Here is a useful idiom for getting contiguous copies instead of
     non-contiguous views.
 
     >>> [np.array(a) for a in np.broadcast_arrays(x, y)]
     [array([[1, 2, 3],
-           [1, 2, 3],
-           [1, 2, 3]]), array([[1, 1, 1],
-           [2, 2, 2],
-           [3, 3, 3]])]
+           [1, 2, 3]]), array([[4, 4, 4],
+           [5, 5, 5]])]
 
     """
     # nditer is not used here to avoid the limit of 32 arrays.
diff --git a/numpy/lib/tests/test_arraypad.py b/numpy/lib/tests/test_arraypad.py
index 8ba0370b0..45d624781 100644
--- a/numpy/lib/tests/test_arraypad.py
+++ b/numpy/lib/tests/test_arraypad.py
@@ -1009,6 +1009,21 @@ class TestUnicodeInput(object):
         assert_array_equal(a, b)
 
 
+class TestObjectInput(object):
+    def test_object_input(self):
+        # Regression test for issue gh-11395.
+        a = np.full((4, 3), None)
+        pad_amt = ((2, 3), (3, 2))
+        b = np.full((9, 8), None)
+        modes = ['edge',
+                 'symmetric',
+                 'reflect',
+                 'wrap',
+                 ]
+        for mode in modes:
+            assert_array_equal(pad(a, pad_amt, mode=mode), b)
+
+
 class TestValueError1(object):
     def test_check_simple(self):
         arr = np.arange(30)
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index 38a9b8000..c7869c582 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -479,7 +479,7 @@ def test_long_str():
 
 @pytest.mark.slow
 def test_memmap_roundtrip():
-    # Fixme: test crashes nose on windows.
+    # Fixme: used to crash on windows
     if not (sys.platform == 'win32' or sys.platform == 'cygwin'):
         for arr in basic_arrays + record_arrays:
             if arr.dtype.hasobject:
@@ -852,3 +852,10 @@ def test_large_archive():
         new_a = np.load(f)["arr"]
 
     assert_(a.shape == new_a.shape)
+
+
+def test_empty_npz():
+    # Test for gh-9989
+    fname = os.path.join(tempdir, "nothing.npz")
+    np.savez(fname)
+    np.load(fname)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 4103a9eb3..d5faed6ae 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1043,6 +1043,16 @@ class TestAngle(object):
         assert_array_almost_equal(y, yo, 11)
         assert_array_almost_equal(z, zo, 11)
 
+    def test_subclass(self):
+        x = np.ma.array([1 + 3j, 1, np.sqrt(2)/2 * (1 + 1j)])
+        x[1] = np.ma.masked
+        expected = np.ma.array([np.arctan(3.0 / 1.0), 0, np.arctan(1.0)])
+        expected[1] = np.ma.masked
+        actual = angle(x)
+        assert_equal(type(actual), type(expected))
+        assert_equal(actual.mask, expected.mask)
+        assert_equal(actual, expected)
+
 
 class TestTrimZeros(object):
 
@@ -1510,6 +1520,18 @@ class TestDigitize(object):
         assert_(not isinstance(digitize(b, a, False), A))
         assert_(not isinstance(digitize(b, a, True), A))
 
+    def test_large_integers_increasing(self):
+        # gh-11022
+        x = 2**54  # loses precision in a float
+        assert_equal(np.digitize(x, [x - 1, x + 1]), 1)
+
+    @pytest.mark.xfail(
+        reason="gh-11022: np.core.multiarray._monotonicity loses precision")
+    def test_large_integers_decreasing(self):
+        # gh-11022
+        x = 2**54  # loses precision in a float
+        assert_equal(np.digitize(x, [x + 1, x - 1]), 1)
+
 
 class TestUnwrap(object):
 
@@ -2237,6 +2259,14 @@ class TestInterp(object):
         x0 = np.nan
         assert_almost_equal(np.interp(x0, x, y), x0)
 
+    def test_non_finite_behavior(self):
+        x = [1, 2, 2.5, 3, 4]
+        xp = [1, 2, 3, 4]
+        fp = [1, 2, np.inf, 4]
+        assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.inf, np.inf, 4])
+        fp = [1, 2, np.nan, 4]
+        assert_almost_equal(np.interp(x, xp, fp), [1, 2, np.nan, np.nan, 4])
+
     def test_complex_interp(self):
         # test complex interpolation
         x = np.linspace(0, 1, 5)
@@ -2251,6 +2281,12 @@ class TestInterp(object):
         x0 = 2.0
         right = 2 + 3.0j
         assert_almost_equal(np.interp(x0, x, y, right=right), right)
+        # test complex non finite
+        x = [1, 2, 2.5, 3, 4]
+        xp = [1, 2, 3, 4]
+        fp = [1, 2+1j, np.inf, 4]
+        y = [1, 2+1j, np.inf+0.5j, np.inf, 4]
+        assert_almost_equal(np.interp(x, xp, fp), y)
         # test complex periodic
         x = [-180, -170, -185, 185, -10, -5, 0, 365]
         xp = [190, -190, 350, -350]
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index e16ae12c2..f136b5c81 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -40,20 +40,28 @@ class TestHistogram(object):
         assert_allclose(e, np.array([1., 2.]))
 
     def test_normed(self):
-        # Check that the integral of the density equals 1.
-        n = 100
-        v = np.random.rand(n)
-        a, b = histogram(v, normed=True)
-        area = np.sum(a * np.diff(b))
-        assert_almost_equal(area, 1)
+        sup = suppress_warnings()
+        with sup:
+            rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*')
+            # Check that the integral of the density equals 1.
+            n = 100
+            v = np.random.rand(n)
+            a, b = histogram(v, normed=True)
+            area = np.sum(a * np.diff(b))
+            assert_almost_equal(area, 1)
+        assert_equal(len(rec), 1)
 
-        # Check with non-constant bin widths (buggy but backwards
-        # compatible)
-        v = np.arange(10)
-        bins = [0, 1, 5, 9, 10]
-        a, b = histogram(v, bins, normed=True)
-        area = np.sum(a * np.diff(b))
-        assert_almost_equal(area, 1)
+        sup = suppress_warnings()
+        with sup:
+            rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*')
+            # Check with non-constant bin widths (buggy but backwards
+            # compatible)
+            v = np.arange(10)
+            bins = [0, 1, 5, 9, 10]
+            a, b = histogram(v, bins, normed=True)
+            area = np.sum(a * np.diff(b))
+            assert_almost_equal(area, 1)
+        assert_equal(len(rec), 1)
 
     def test_density(self):
         # Check that the integral of the density equals 1.
@@ -70,6 +78,10 @@ class TestHistogram(object):
         assert_array_equal(a, .1)
         assert_equal(np.sum(a * np.diff(b)), 1)
 
+        # Test that passing False works too
+        a, b = histogram(v, bins, density=False)
+        assert_array_equal(a, [1, 2, 3, 4])
+
         # Variable bin widths are especially useful to deal with
         # infinities.
         v = np.arange(10)
@@ -96,12 +108,12 @@
         assert_equal(h.sum(), 9)
 
         # Normalization
-        h, b = histogram(a, range=[1, 9], normed=True)
+        h, b = histogram(a, range=[1, 9], density=True)
         assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15)
 
         # Weights
         w = np.arange(10) + .5
-        h, b = histogram(a, range=[1, 9], weights=w, normed=True)
+        h, b = histogram(a, range=[1, 9], weights=w, density=True)
         assert_equal((h * np.diff(b)).sum(), 1)
 
         h, b = histogram(a, bins=8, range=[1, 9], weights=w)
@@ -113,7 +125,7 @@
         h, b = histogram(a)
         assert_(np.issubdtype(h.dtype, np.integer))
 
-        h, b = histogram(a, normed=True)
+        h, b = histogram(a, density=True)
         assert_(np.issubdtype(h.dtype, np.floating))
 
         h, b = histogram(a, weights=np.ones(10, int))
@@ -133,9 +145,9 @@
         v = np.random.rand(100)
         w = np.ones(100) * 5
         a, b = histogram(v)
-        na, nb = histogram(v, normed=True)
+        na, nb = histogram(v, density=True)
         wa, wb = histogram(v, weights=w)
-        nwa, nwb = histogram(v, weights=w, normed=True)
+        nwa, nwb = histogram(v, weights=w, density=True)
         assert_array_almost_equal(a * 5, wa)
         assert_array_almost_equal(na, nwa)
@@ -149,7 +161,7 @@
         wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1])
         assert_array_equal(wa, [4, 5, 0, 1])
         wa, wb = histogram(
-            [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True)
+            [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], density=True)
         assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4)
 
         # Check weights with non-uniform bin widths
@@ -535,13 +547,13 @@ class TestHistogramdd(object):
         # Check normalization
         ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
-        H, edges = histogramdd(x, bins=ed, normed=True)
+        H, edges = histogramdd(x, bins=ed, density=True)
         assert_(np.all(H == answer / 12.))
 
         # Check that H has the correct shape.
H, edges = histogramdd(x, (2, 3, 4),
                                range=[[-1, 1], [0, 3], [0, 4]],
-                               normed=True)
+                               density=True)
         answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
                            [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
         assert_array_almost_equal(H, answer / 6., 4)
@@ -587,10 +599,10 @@ class TestHistogramdd(object):
     def test_weights(self):
         v = np.random.rand(100, 2)
         hist, edges = histogramdd(v)
-        n_hist, edges = histogramdd(v, normed=True)
+        n_hist, edges = histogramdd(v, density=True)
         w_hist, edges = histogramdd(v, weights=np.ones(100))
         assert_array_equal(w_hist, hist)
-        w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
+        w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, density=True)
         assert_array_equal(w_hist, n_hist)
         w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
         assert_array_equal(w_hist, 2 * hist)
@@ -695,3 +707,39 @@
         hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
         assert_equal(hist[0, 0], 1)
+
+    def test_density_non_uniform_2d(self):
+        # Defines the following grid:
+        #
+        #    0 2     8
+        #   0+-+-----+
+        #    + |     +
+        #    + |     +
+        #   6+-+-----+
+        #   8+-+-----+
+        x_edges = np.array([0, 2, 8])
+        y_edges = np.array([0, 6, 8])
+        relative_areas = np.array([
+            [3, 9],
+            [1, 3]])
+
+        # ensure the number of points in each region is proportional to its area
+        x = np.array([1] + [1]*3 + [7]*3 + [7]*9)
+        y = np.array([7] + [1]*3 + [7]*3 + [1]*9)
+
+        # sanity check that the above worked as intended
+        hist, edges = histogramdd((y, x), bins=(y_edges, x_edges))
+        assert_equal(hist, relative_areas)
+
+        # resulting histogram should be uniform, since counts and areas are proportional
+        hist, edges = histogramdd((y, x), bins=(y_edges, x_edges), density=True)
+        assert_equal(hist, 1 / (8*8))
+
+    def test_density_non_uniform_1d(self):
+        # compare to histogram to show the results are the same
+        v = np.arange(10)
+        bins = np.array([0, 1, 3, 6, 10])
+        hist, edges = histogram(v, bins, density=True)
+        hist_dd, edges_dd = histogramdd((v,), (bins,), density=True)
+        assert_equal(hist, hist_dd)
+        assert_equal(edges, edges_dd[0])
diff --git a/numpy/lib/tests/test_polynomial.py b/numpy/lib/tests/test_polynomial.py
index 7f6fca4a4..9f7c117a2 100644
--- a/numpy/lib/tests/test_polynomial.py
+++ b/numpy/lib/tests/test_polynomial.py
@@ -1,93 +1,79 @@
-'''
->>> p = np.poly1d([1.,2,3])
->>> p
-poly1d([1., 2., 3.])
->>> print(p)
-   2
-1 x + 2 x + 3
->>> q = np.poly1d([3.,2,1])
->>> q
-poly1d([3., 2., 1.])
->>> print(q)
-   2
-3 x + 2 x + 1
->>> print(np.poly1d([1.89999+2j, -3j, -5.12345678, 2+1j]))
-            3      2
-(1.9 + 2j) x - 3j x - 5.123 x + (2 + 1j)
->>> print(np.poly1d([-3, -2, -1]))
-    2
--3 x - 2 x - 1
-
->>> p(0)
-3.0
->>> p(5)
-38.0
->>> q(0)
-1.0
->>> q(5)
-86.0
-
->>> p * q
-poly1d([ 3.,  8., 14.,  8.,  3.])
->>> p / q
-(poly1d([0.33333333]), poly1d([1.33333333, 2.66666667]))
->>> p + q
-poly1d([4., 4., 4.])
->>> p - q
-poly1d([-2.,  0.,  2.])
->>> p ** 4
-poly1d([  1.,   8.,  36., 104., 214., 312., 324., 216.,  81.])
-
->>> p(q)
-poly1d([ 9., 12., 16.,  8.,  6.])
->>> q(p)
-poly1d([ 3., 12., 32., 40., 34.])
-
->>> np.asarray(p)
-array([1., 2., 3.])
->>> len(p)
-2
-
->>> p[0], p[1], p[2], p[3]
-(3.0, 2.0, 1.0, 0)
-
->>> p.integ()
-poly1d([0.33333333, 1.        , 3.        , 0.        ])
->>> p.integ(1)
-poly1d([0.33333333, 1.        , 3.        , 0.        ])
->>> p.integ(5)
-poly1d([0.00039683, 0.00277778, 0.025     , 0.        , 0.        ,
-        0.        , 0.        , 0.
]) ->>> p.deriv() -poly1d([2., 2.]) ->>> p.deriv(2) -poly1d([2.]) - ->>> q = np.poly1d([1.,2,3], variable='y') ->>> print(q) - 2 -1 y + 2 y + 3 ->>> q = np.poly1d([1.,2,3], variable='lambda') ->>> print(q) - 2 -1 lambda + 2 lambda + 3 - ->>> np.polydiv(np.poly1d([1,0,-1]), np.poly1d([1,1])) -(poly1d([ 1., -1.]), poly1d([0.])) - -''' from __future__ import division, absolute_import, print_function import numpy as np from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_almost_equal, - assert_array_almost_equal, assert_raises, rundocs + assert_array_almost_equal, assert_raises ) -class TestDocs(object): - def test_doctests(self): - return rundocs() +class TestPolynomial(object): + def test_poly1d_str_and_repr(self): + p = np.poly1d([1., 2, 3]) + assert_equal(repr(p), 'poly1d([1., 2., 3.])') + assert_equal(str(p), + ' 2\n' + '1 x + 2 x + 3') + + q = np.poly1d([3., 2, 1]) + assert_equal(repr(q), 'poly1d([3., 2., 1.])') + assert_equal(str(q), + ' 2\n' + '3 x + 2 x + 1') + + r = np.poly1d([1.89999 + 2j, -3j, -5.12345678, 2 + 1j]) + assert_equal(str(r), + ' 3 2\n' + '(1.9 + 2j) x - 3j x - 5.123 x + (2 + 1j)') + + assert_equal(str(np.poly1d([-3, -2, -1])), + ' 2\n' + '-3 x - 2 x - 1') + + def test_poly1d_resolution(self): + p = np.poly1d([1., 2, 3]) + q = np.poly1d([3., 2, 1]) + assert_equal(p(0), 3.0) + assert_equal(p(5), 38.0) + assert_equal(q(0), 1.0) + assert_equal(q(5), 86.0) + + def test_poly1d_math(self): + # here we use some simple coeffs to make calculations easier + p = np.poly1d([1., 2, 4]) + q = np.poly1d([4., 2, 1]) + assert_equal(p/q, (np.poly1d([0.25]), np.poly1d([1.5, 3.75]))) + assert_equal(p.integ(), np.poly1d([1/3, 1., 4., 0.])) + assert_equal(p.integ(1), np.poly1d([1/3, 1., 4., 0.])) + + p = np.poly1d([1., 2, 3]) + q = np.poly1d([3., 2, 1]) + assert_equal(p * q, np.poly1d([3., 8., 14., 8., 3.])) + assert_equal(p + q, np.poly1d([4., 4., 4.])) + assert_equal(p - q, np.poly1d([-2., 0., 2.])) + assert_equal(p ** 4, np.poly1d([1., 8., 36., 104., 214., 312., 324., 216., 81.])) + assert_equal(p(q), np.poly1d([9., 12., 16., 8., 6.])) + assert_equal(q(p), np.poly1d([3., 12., 32., 40., 34.])) + assert_equal(p.deriv(), np.poly1d([2., 2.])) + assert_equal(p.deriv(2), np.poly1d([2.])) + assert_equal(np.polydiv(np.poly1d([1, 0, -1]), np.poly1d([1, 1])), + (np.poly1d([1., -1.]), np.poly1d([0.]))) + + def test_poly1d_misc(self): + p = np.poly1d([1., 2, 3]) + assert_equal(np.asarray(p), np.array([1., 2., 3.])) + assert_equal(len(p), 2) + assert_equal((p[0], p[1], p[2], p[3]), (3.0, 2.0, 1.0, 0)) + + def test_poly1d_variable_arg(self): + q = np.poly1d([1., 2, 3], variable='y') + assert_equal(str(q), + ' 2\n' + '1 y + 2 y + 3') + q = np.poly1d([1., 2, 3], variable='lambda') + assert_equal(str(q), + ' 2\n' + '1 lambda + 2 lambda + 3') def test_poly(self): assert_array_almost_equal(np.poly([3, -np.sqrt(2), np.sqrt(2)]), diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py index 219ae24fa..d4828bc1f 100644 --- a/numpy/lib/tests/test_recfunctions.py +++ b/numpy/lib/tests/test_recfunctions.py @@ -9,8 +9,8 @@ from numpy.ma.testutils import assert_equal from numpy.testing import assert_, assert_raises from numpy.lib.recfunctions import ( drop_fields, rename_fields, get_fieldstructure, recursive_fill_fields, - find_duplicates, merge_arrays, append_fields, stack_arrays, join_by - ) + find_duplicates, merge_arrays, append_fields, stack_arrays, join_by, + repack_fields) get_names = np.lib.recfunctions.get_names get_names_flat = 
np.lib.recfunctions.get_names_flat zip_descr = np.lib.recfunctions.zip_descr @@ -192,6 +192,18 @@ class TestRecFunctions(object): assert_equal(sorted(test[-1]), control) assert_equal(test[0], a[test[-1]]) + def test_repack_fields(self): + dt = np.dtype('u1,f4,i8', align=True) + a = np.zeros(2, dtype=dt) + + assert_equal(repack_fields(dt), np.dtype('u1,f4,i8')) + assert_equal(repack_fields(a).itemsize, 13) + assert_equal(repack_fields(repack_fields(dt), align=True), dt) + + # make sure type is preserved + dt = np.dtype((np.record, dt)) + assert_(repack_fields(dt).type is np.record) + class TestRecursiveFillFields(object): # Test recursive_fill_fields. diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py index c95894f94..6d24dd624 100644 --- a/numpy/lib/tests/test_shape_base.py +++ b/numpy/lib/tests/test_shape_base.py @@ -293,6 +293,15 @@ class TestExpandDims(object): assert_warns(DeprecationWarning, expand_dims, a, -6) assert_warns(DeprecationWarning, expand_dims, a, 5) + def test_subclasses(self): + a = np.arange(10).reshape((2, 5)) + a = np.ma.array(a, mask=a%3 == 0) + + expanded = np.expand_dims(a, axis=1) + assert_(isinstance(expanded, np.ma.MaskedArray)) + assert_equal(expanded.shape, (2, 1, 5)) + assert_equal(expanded.mask.shape, (2, 1, 5)) + class TestArraySplit(object): def test_integer_0_split(self): diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py index d3a072af3..bf93b4adb 100644 --- a/numpy/lib/tests/test_twodim_base.py +++ b/numpy/lib/tests/test_twodim_base.py @@ -208,7 +208,7 @@ class TestHistogram2d(object): x = array([1, 1, 2, 3, 4, 4, 4, 5]) y = array([1, 3, 2, 0, 1, 2, 3, 4]) H, xed, yed = histogram2d( - x, y, (6, 5), range=[[0, 6], [0, 5]], normed=True) + x, y, (6, 5), range=[[0, 6], [0, 5]], density=True) answer = array( [[0., 0, 0, 0, 0], [0, 1, 0, 1, 0], @@ -220,11 +220,11 @@ class TestHistogram2d(object): assert_array_equal(xed, np.linspace(0, 6, 7)) assert_array_equal(yed, np.linspace(0, 5, 6)) - def test_norm(self): + def test_density(self): x = array([1, 2, 3, 1, 2, 3, 1, 2, 3]) y = array([1, 1, 1, 2, 2, 2, 3, 3, 3]) H, xed, yed = histogram2d( - x, y, [[1, 2, 3, 5], [1, 2, 3, 5]], normed=True) + x, y, [[1, 2, 3, 5], [1, 2, 3, 5]], density=True) answer = array([[1, 1, .5], [1, 1, .5], [.5, .5, .25]])/9. 
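The density tests above all check the same invariant: with ``density=True`` the returned values are ``bin_count / sample_count / bin_area``, so the histogram integrates to one over the binned region. A minimal sketch (not part of the diff) using the non-uniform edges from ``test_density``:

    import numpy as np

    x = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3], dtype=float)
    y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype=float)
    H, xed, yed = np.histogram2d(x, y, bins=[[1, 2, 3, 5], [1, 2, 3, 5]],
                                 density=True)
    # bin areas follow from the (non-uniform) edge spacing
    areas = np.diff(xed)[:, None] * np.diff(yed)[None, :]
    assert np.isclose((H * areas).sum(), 1.0)  # densities integrate to 1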
diff --git a/numpy/lib/tests/test_ufunclike.py b/numpy/lib/tests/test_ufunclike.py index 5604b3744..0f06876a1 100644 --- a/numpy/lib/tests/test_ufunclike.py +++ b/numpy/lib/tests/test_ufunclike.py @@ -4,8 +4,8 @@ import numpy as np import numpy.core as nx import numpy.lib.ufunclike as ufl from numpy.testing import ( - assert_, assert_equal, assert_array_equal, assert_warns - ) + assert_, assert_equal, assert_array_equal, assert_warns, assert_raises +) class TestUfunclike(object): @@ -21,6 +21,10 @@ class TestUfunclike(object): assert_equal(res, tgt) assert_equal(out, tgt) + a = a.astype(np.complex) + with assert_raises(TypeError): + ufl.isposinf(a) + def test_isneginf(self): a = nx.array([nx.inf, -nx.inf, nx.nan, 0.0, 3.0, -3.0]) out = nx.zeros(a.shape, bool) @@ -32,6 +36,10 @@ class TestUfunclike(object): assert_equal(res, tgt) assert_equal(out, tgt) + a = a.astype(np.complex) + with assert_raises(TypeError): + ufl.isneginf(a) + def test_fix(self): a = nx.array([[1.0, 1.1, 1.5, 1.8], [-1.0, -1.1, -1.5, -1.8]]) out = nx.zeros(a.shape, float) @@ -52,7 +60,8 @@ class TestUfunclike(object): return res def __array_wrap__(self, obj, context=None): - obj.metadata = self.metadata + if isinstance(obj, MyArray): + obj.metadata = self.metadata return obj def __array_finalize__(self, obj): diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py index cca316e9a..98efba191 100644 --- a/numpy/lib/twodim_base.py +++ b/numpy/lib/twodim_base.py @@ -530,7 +530,8 @@ def vander(x, N=None, increasing=False): return v -def histogram2d(x, y, bins=10, range=None, normed=False, weights=None): +def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, + density=None): """ Compute the bi-dimensional histogram of two data samples. @@ -560,9 +561,14 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None): (if not specified explicitly in the `bins` parameters): ``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range will be considered outliers and not tallied in the histogram. + density : bool, optional + If False, the default, returns the number of samples in each bin. + If True, returns the probability *density* function at the bin, + ``bin_count / sample_count / bin_area``. normed : bool, optional - If False, returns the number of samples in each bin. If True, - returns the bin density ``bin_count / sample_count / bin_area``. + An alias for the density argument that behaves identically. To avoid + confusion with the broken normed argument to `histogram`, `density` + should be preferred. weights : array_like, shape(N,), optional An array of values ``w_i`` weighing each sample ``(x_i, y_i)``. Weights are normalized to 1 if `normed` is True. 
If `normed` is @@ -652,7 +658,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None): if N != 1 and N != 2: xedges = yedges = asarray(bins) bins = [xedges, yedges] - hist, edges = histogramdd([x, y], bins, range, normed, weights) + hist, edges = histogramdd([x, y], bins, range, normed, weights, density) return hist, edges[0], edges[1] diff --git a/numpy/lib/type_check.py b/numpy/lib/type_check.py index 1664e6ebb..3f7aa32fa 100644 --- a/numpy/lib/type_check.py +++ b/numpy/lib/type_check.py @@ -215,7 +215,7 @@ def iscomplex(x): if issubclass(ax.dtype.type, _nx.complexfloating): return ax.imag != 0 res = zeros(ax.shape, bool) - return +res # convert to array-scalar if needed + return res[()] # convert to scalar if needed def isreal(x): """ diff --git a/numpy/lib/ufunclike.py b/numpy/lib/ufunclike.py index e0bd95182..6259c5445 100644 --- a/numpy/lib/ufunclike.py +++ b/numpy/lib/ufunclike.py @@ -11,6 +11,7 @@ import numpy.core.numeric as nx import warnings import functools + def _deprecate_out_named_y(f): """ Allow the out argument to be passed as the name `y` (deprecated) @@ -81,6 +82,7 @@ def fix(x, out=None): res = res[()] return res + @_deprecate_out_named_y def isposinf(x, out=None): """ @@ -116,8 +118,9 @@ def isposinf(x, out=None): NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754). - Errors result if the second argument is also supplied when `x` is a - scalar input, or if first and second arguments have different shapes. + Errors result if the second argument is also supplied when x is a scalar + input, if first and second arguments have different shapes, or if the + first argument has complex values. Examples -------- @@ -138,7 +141,14 @@ def isposinf(x, out=None): array([0, 0, 1]) """ - return nx.logical_and(nx.isinf(x), ~nx.signbit(x), out) + is_inf = nx.isinf(x) + try: + signbit = ~nx.signbit(x) + except TypeError: + raise TypeError('This operation is not supported for complex values ' + 'because it would be ambiguous.') + else: + return nx.logical_and(is_inf, signbit, out) @_deprecate_out_named_y @@ -178,7 +188,8 @@ def isneginf(x, out=None): (IEEE 754). Errors result if the second argument is also supplied when x is a scalar - input, or if first and second arguments have different shapes. + input, if first and second arguments have different shapes, or if the + first argument has complex values. Examples -------- @@ -199,4 +210,11 @@ def isneginf(x, out=None): array([1, 0, 0]) """ - return nx.logical_and(nx.isinf(x), nx.signbit(x), out) + is_inf = nx.isinf(x) + try: + signbit = nx.signbit(x) + except TypeError: + raise TypeError('This operation is not supported for complex values ' + 'because it would be ambiguous.') + else: + return nx.logical_and(is_inf, signbit, out) diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py index 1ecd334af..9678bab76 100644 --- a/numpy/lib/utils.py +++ b/numpy/lib/utils.py @@ -982,12 +982,12 @@ def _getmembers(item): #----------------------------------------------------------------------------- # The following SafeEval class and company are adapted from Michael Spencer's -# ASPN Python Cookbook recipe: -# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/364469 +# ASPN Python Cookbook recipe: https://code.activestate.com/recipes/364469/ +# # Accordingly it is mostly Copyright 2006 by Michael Spencer. # The recipe, like most of the other ASPN Python Cookbook recipes was made # available under the Python license. 
-# http://www.python.org/license +# https://en.wikipedia.org/wiki/Python_License # It has been modified to: # * handle unary -/+ diff --git a/numpy/linalg/__init__.py b/numpy/linalg/__init__.py index 37bd27574..4b696c883 100644 --- a/numpy/linalg/__init__.py +++ b/numpy/linalg/__init__.py @@ -50,6 +50,6 @@ from .info import __doc__ from .linalg import * -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py index 98af0733b..c3b76ada7 100644 --- a/numpy/linalg/linalg.py +++ b/numpy/linalg/linalg.py @@ -780,7 +780,7 @@ def qr(a, mode='reduced'): dorgqr, and zungqr. For more information on the qr factorization, see for example: - http://en.wikipedia.org/wiki/QR_factorization + https://en.wikipedia.org/wiki/QR_factorization Subclasses of `ndarray` are preserved except for the 'raw' mode. So if `a` is of type `matrix`, all the return values will be matrices too. @@ -858,13 +858,13 @@ def qr(a, mode='reduced'): a, wrap = _makearray(a) _assertRank2(a) - _assertNoEmpty2d(a) m, n = a.shape t, result_t = _commonType(a) a = _fastCopyAndTranspose(t, a) a = _to_native_byte_order(a) mn = min(m, n) tau = zeros((mn,), t) + if isComplexType(t): lapack_routine = lapack_lite.zgeqrf routine_name = 'zgeqrf' @@ -875,14 +875,14 @@ def qr(a, mode='reduced'): # calculate optimal size of work data 'work' lwork = 1 work = zeros((lwork,), t) - results = lapack_routine(m, n, a, m, tau, work, -1, 0) + results = lapack_routine(m, n, a, max(1, m), tau, work, -1, 0) if results['info'] != 0: raise LinAlgError('%s returns %d' % (routine_name, results['info'])) # do qr decomposition - lwork = int(abs(work[0])) + lwork = max(1, n, int(abs(work[0]))) work = zeros((lwork,), t) - results = lapack_routine(m, n, a, m, tau, work, lwork, 0) + results = lapack_routine(m, n, a, max(1, m), tau, work, lwork, 0) if results['info'] != 0: raise LinAlgError('%s returns %d' % (routine_name, results['info'])) @@ -918,14 +918,14 @@ def qr(a, mode='reduced'): # determine optimal lwork lwork = 1 work = zeros((lwork,), t) - results = lapack_routine(m, mc, mn, q, m, tau, work, -1, 0) + results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0) if results['info'] != 0: raise LinAlgError('%s returns %d' % (routine_name, results['info'])) # compute q - lwork = int(abs(work[0])) + lwork = max(1, n, int(abs(work[0]))) work = zeros((lwork,), t) - results = lapack_routine(m, mc, mn, q, m, tau, work, lwork, 0) + results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0) if results['info'] != 0: raise LinAlgError('%s returns %d' % (routine_name, results['info'])) @@ -965,8 +965,10 @@ def eigvals(a): See Also -------- eig : eigenvalues and right eigenvectors of general arrays - eigvalsh : eigenvalues of symmetric or Hermitian arrays. - eigh : eigenvalues and eigenvectors of symmetric/Hermitian arrays. + eigvalsh : eigenvalues of real symmetric or complex Hermitian + (conjugate symmetric) arrays. + eigh : eigenvalues and eigenvectors of real symmetric or complex + Hermitian (conjugate symmetric) arrays. Notes ----- @@ -1027,7 +1029,7 @@ def eigvals(a): def eigvalsh(a, UPLO='L'): """ - Compute the eigenvalues of a Hermitian or real symmetric matrix. + Compute the eigenvalues of a complex Hermitian or real symmetric matrix. Main difference from eigh: the eigenvectors are not computed. 
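The ``max(1, m)`` / ``max(1, n, ...)`` guards in the qr hunks above reflect LAPACK's requirement that ``lda`` and ``lwork`` be at least 1 even when a dimension is zero; combined with the removed ``_assertNoEmpty2d`` check, empty matrices can now be factored. A minimal sketch (not part of the diff) of the behaviour this enables:

    import numpy as np

    a = np.empty((0, 3))
    q, r = np.linalg.qr(a)    # previously raised LinAlgError
    assert q.shape == (0, 0)  # reduced mode: k = min(0, 3) = 0
    assert r.shape == (0, 3)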
@@ -1057,7 +1059,8 @@ def eigvalsh(a, UPLO='L'): See Also -------- - eigh : eigenvalues and eigenvectors of symmetric/Hermitian arrays. + eigh : eigenvalues and eigenvectors of real symmetric or complex Hermitian + (conjugate symmetric) arrays. eigvals : eigenvalues of general real or complex arrays. eig : eigenvalues and right eigenvectors of general real or complex arrays. @@ -1159,11 +1162,11 @@ def eig(a): -------- eigvals : eigenvalues of a non-symmetric array. - eigh : eigenvalues and eigenvectors of a symmetric or Hermitian - (conjugate symmetric) array. + eigh : eigenvalues and eigenvectors of a real symmetric or complex + Hermitian (conjugate symmetric) array. - eigvalsh : eigenvalues of a symmetric or Hermitian (conjugate symmetric) - array. + eigvalsh : eigenvalues of a real symmetric or complex Hermitian + (conjugate symmetric) array. Notes ----- @@ -1268,7 +1271,8 @@ def eig(a): def eigh(a, UPLO='L'): """ - Return the eigenvalues and eigenvectors of a Hermitian or symmetric matrix. + Return the eigenvalues and eigenvectors of a complex Hermitian + (conjugate symmetric) or a real symmetric matrix. Returns two objects, a 1-D array containing the eigenvalues of `a`, and a 2-D square array or matrix (depending on the input type) of the @@ -1277,7 +1281,7 @@ def eigh(a, UPLO='L'): Parameters ---------- a : (..., M, M) array - Hermitian/Symmetric matrices whose eigenvalues and + Hermitian or real symmetric matrices whose eigenvalues and eigenvectors are to be computed. UPLO : {'L', 'U'}, optional Specifies whether the calculation is done with the lower triangular @@ -1304,7 +1308,8 @@ def eigh(a, UPLO='L'): See Also -------- - eigvalsh : eigenvalues of symmetric or Hermitian arrays. + eigvalsh : eigenvalues of real symmetric or complex Hermitian + (conjugate symmetric) arrays. eig : eigenvalues and right eigenvectors for non-symmetric arrays. eigvals : eigenvalues of non-symmetric arrays. @@ -1527,7 +1532,6 @@ def svd(a, full_matrices=True, compute_uv=True): """ a, wrap = _makearray(a) - _assertNoEmpty2d(a) _assertRankAtLeast2(a) t, result_t = _commonType(a) @@ -1644,6 +1648,7 @@ def cond(x, p=None): """ x = asarray(x) # in case we have a matrix + _assertNoEmpty2d(x) if p is None or p == 2 or p == -2: s = svd(x, compute_uv=False) with errstate(all='ignore'): @@ -1750,7 +1755,7 @@ def matrix_rank(M, tol=None, hermitian=False): References ---------- .. [1] MATLAB reference documention, "Rank" - http://www.mathworks.com/help/techdoc/ref/rank.html + https://www.mathworks.com/help/techdoc/ref/rank.html .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery, "Numerical Recipes (3rd edition)", Cambridge University Press, 2007, page 795. @@ -2468,7 +2473,7 @@ def multi_dot(arrays): ---------- .. [1] Cormen, "Introduction to Algorithms", Chapter 15.2, p. 370-378 - .. [2] http://en.wikipedia.org/wiki/Matrix_chain_multiplication + .. 
[2] https://en.wikipedia.org/wiki/Matrix_chain_multiplication Examples -------- diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py index 87dfe988a..0df673884 100644 --- a/numpy/linalg/tests/test_linalg.py +++ b/numpy/linalg/tests/test_linalg.py @@ -644,10 +644,6 @@ class TestEig(EigCases): class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): - if 'size-0' in tags: - assert_raises(LinAlgError, linalg.svd, a, 0) - return - u, s, vt = linalg.svd(a, 0) assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :], np.asarray(vt)), @@ -670,15 +666,19 @@ class TestSVD(SVDCases): for dtype in [single, double, csingle, cdouble]: check(dtype) - def test_0_size(self): - # These raise errors currently - # (which does not mean that it may not make sense) - a = np.zeros((0, 0), dtype=np.complex64) - assert_raises(linalg.LinAlgError, linalg.svd, a) - a = np.zeros((0, 1), dtype=np.complex64) - assert_raises(linalg.LinAlgError, linalg.svd, a) - a = np.zeros((1, 0), dtype=np.complex64) - assert_raises(linalg.LinAlgError, linalg.svd, a) + def test_empty_identity(self): + """ Empty input should put an identity matrix in u or vh """ + x = np.empty((4, 0)) + u, s, vh = linalg.svd(x, compute_uv=True) + assert_equal(u.shape, (4, 4)) + assert_equal(vh.shape, (0, 0)) + assert_equal(u, np.eye(4)) + + x = np.empty((0, 4)) + u, s, vh = linalg.svd(x, compute_uv=True) + assert_equal(u.shape, (0, 0)) + assert_equal(vh.shape, (4, 4)) + assert_equal(vh, np.eye(4)) class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): @@ -1582,9 +1582,25 @@ class TestQR(object): assert_(isinstance(r2, a_type)) assert_almost_equal(r2, r1) - def test_qr_empty(self): - a = np.zeros((0, 2)) - assert_raises(linalg.LinAlgError, linalg.qr, a) + + @pytest.mark.parametrize(["m", "n"], [ + (3, 0), + (0, 3), + (0, 0) + ]) + def test_qr_empty(self, m, n): + k = min(m, n) + a = np.empty((m, n)) + a_type = type(a) + a_dtype = a.dtype + + self.check_qr(a) + + h, tau = np.linalg.qr(a, mode='raw') + assert_equal(h.dtype, np.double) + assert_equal(tau.dtype, np.double) + assert_equal(h.shape, (n, m)) + assert_equal(tau.shape, (k,)) def test_mode_raw(self): # The factorization is not unique and varies between libraries, @@ -1625,15 +1641,6 @@ class TestQR(object): self.check_qr(m2) self.check_qr(m2.T) - def test_0_size(self): - # There may be good ways to do (some of this) reasonably: - a = np.zeros((0, 0)) - assert_raises(linalg.LinAlgError, linalg.qr, a) - a = np.zeros((0, 1)) - assert_raises(linalg.LinAlgError, linalg.qr, a) - a = np.zeros((1, 0)) - assert_raises(linalg.LinAlgError, linalg.qr, a) - class TestCholesky(object): # TODO: are there no other tests for cholesky? diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src index 7dc1cb0cb..9fc68a7aa 100644 --- a/numpy/linalg/umath_linalg.c.src +++ b/numpy/linalg/umath_linalg.c.src @@ -2735,19 +2735,18 @@ static NPY_INLINE void (fortran_int)dimensions[0], (fortran_int)dimensions[1])) { LINEARIZE_DATA_t a_in, u_out, s_out, v_out; + fortran_int min_m_n = params.M < params.N ? params.M : params.N; init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]); if ('N' == params.JOBZ) { /* only the singular values are wanted */ - fortran_int min_m_n = params.M < params.N? params.M : params.N; init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]); } else { fortran_int u_columns, v_rows; - fortran_int min_m_n = params.M < params.N? 
params.M : params.N; if ('S' == params.JOBZ) { u_columns = min_m_n; v_rows = min_m_n; - } else { + } else { /* JOBZ == 'A' */ u_columns = params.M; v_rows = params.N; } @@ -2771,6 +2770,15 @@ static NPY_INLINE void if ('N' == params.JOBZ) { delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out); } else { + if ('A' == params.JOBZ && min_m_n == 0) { + /* Lapack has betrayed us and left these uninitialized, + * so produce an identity matrix for whichever of u + * and v is not empty. + */ + identity_@TYPE@_matrix(params.U, params.M); + identity_@TYPE@_matrix(params.VT, params.N); + } + delinearize_@TYPE@_matrix(args[1], params.U, &u_out); delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out); delinearize_@TYPE@_matrix(args[3], params.VT, &v_out); diff --git a/numpy/ma/README.txt b/numpy/ma/README.txt index ef9635e57..47f20d645 100644 --- a/numpy/ma/README.txt +++ b/numpy/ma/README.txt @@ -4,7 +4,7 @@ A Guide to Masked Arrays in NumPy .. Contents:: -See http://www.scipy.org/scipy/numpy/wiki/MaskedArray +See http://www.scipy.org/scipy/numpy/wiki/MaskedArray (dead link) for updates of this document. @@ -18,7 +18,7 @@ that could store some additional information along with numerical values, while keeping the possibility for missing data (picture storing a series of dates along with measurements, what would later become the `TimeSeries Scikit <http://projects.scipy.org/scipy/scikits/wiki/TimeSeries>`__ -. +(dead link). I started to implement such a class, but then quickly realized that any additional information disappeared when processing these subarrays diff --git a/numpy/ma/__init__.py b/numpy/ma/__init__.py index 34f21b8b1..36ceb1f6e 100644 --- a/numpy/ma/__init__.py +++ b/numpy/ma/__init__.py @@ -51,6 +51,6 @@ __all__ = ['core', 'extras'] __all__ += core.__all__ __all__ += extras.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/ma/core.py b/numpy/ma/core.py index d4c4c4437..65ce967ae 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -43,7 +43,7 @@ from numpy.lib.function_base import angle from numpy.compat import ( getargspec, formatargspec, long, basestring, unicode, bytes ) -from numpy import expand_dims as n_expand_dims +from numpy import expand_dims from numpy.core.multiarray import normalize_axis_index from numpy.core.numeric import normalize_axis_tuple @@ -2319,8 +2319,10 @@ def masked_values(x, value, rtol=1e-5, atol=1e-8, copy=True, shrink=True): mask = np.isclose(xnew, value, atol=atol, rtol=rtol) else: mask = umath.equal(xnew, value) - return masked_array( - xnew, mask=mask, copy=copy, fill_value=value, shrink=shrink) + ret = masked_array(xnew, mask=mask, copy=copy, fill_value=value) + if shrink: + ret.shrink_mask() + return ret def masked_invalid(a, copy=True): @@ -6792,56 +6794,6 @@ def diag(v, k=0): return output -def expand_dims(x, axis): - """ - Expand the shape of an array. - - Expands the shape of the array by including a new axis before the one - specified by the `axis` parameter. This function behaves the same as - `numpy.expand_dims` but preserves masked elements. - - See Also - -------- - numpy.expand_dims : Equivalent function in top-level NumPy module. 
- - Examples - -------- - >>> import numpy.ma as ma - >>> x = ma.array([1, 2, 4]) - >>> x[1] = ma.masked - >>> x - masked_array(data = [1 -- 4], - mask = [False True False], - fill_value = 999999) - >>> np.expand_dims(x, axis=0) - array([[1, 2, 4]]) - >>> ma.expand_dims(x, axis=0) - masked_array(data = - [[1 -- 4]], - mask = - [[False True False]], - fill_value = 999999) - - The same result can be achieved using slicing syntax with `np.newaxis`. - - >>> x[np.newaxis, :] - masked_array(data = - [[1 -- 4]], - mask = - [[False True False]], - fill_value = 999999) - - """ - result = n_expand_dims(x, axis) - if isinstance(x, MaskedArray): - new_shape = result.shape - result = x.view() - result.shape = new_shape - if result._mask is not nomask: - result._mask.shape = new_shape - return result - - def left_shift(a, n): """ Shift the bits of an integer to the left. @@ -7112,32 +7064,32 @@ size.__doc__ = np.size.__doc__ def where(condition, x=_NoValue, y=_NoValue): """ - Return a masked array with elements from x or y, depending on condition. + Return a masked array with elements from `x` or `y`, depending on condition. - Returns a masked array, shaped like condition, where the elements - are from `x` when `condition` is True, and from `y` otherwise. - If neither `x` nor `y` are given, the function returns a tuple of - indices where `condition` is True (the result of - ``condition.nonzero()``). + .. note:: + When only `condition` is provided, this function is identical to + `nonzero`. The rest of this documentation covers only the case where + all three arguments are provided. Parameters ---------- condition : array_like, bool - The condition to meet. For each True element, yield the corresponding - element from `x`, otherwise from `y`. + Where True, yield `x`, otherwise yield `y`. x, y : array_like, optional Values from which to choose. `x`, `y` and `condition` need to be broadcastable to some shape. Returns ------- - out : MaskedArray or tuple of ndarrays - The resulting masked array if `x` and `y` were given, otherwise - the result of ``condition.nonzero()``. + out : MaskedArray + A masked array with `masked` elements where the condition is masked, + elements from `x` where `condition` is True, and elements from `y` + elsewhere. See Also -------- numpy.where : Equivalent function in the top-level NumPy module. 
+ nonzero : The function that is called when x and y are omitted Examples -------- @@ -7148,9 +7100,6 @@ def where(condition, x=_NoValue, y=_NoValue): [[0.0 -- 2.0] [-- 4.0 --] [6.0 -- 8.0]] - >>> np.ma.where(x > 5) # return the indices where x > 5 - (array([2, 2]), array([0, 2])) - >>> print(np.ma.where(x > 5, x, -3.1416)) [[-3.1416 -- -3.1416] [-- -3.1416 --] diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py index b086ec69c..129809b5d 100644 --- a/numpy/ma/tests/test_core.py +++ b/numpy/ma/tests/test_core.py @@ -4904,6 +4904,12 @@ class TestMaskedWhereAliases(object): res = np.ma.masked_values(np.inf, -np.inf) assert_equal(res.mask, False) + res = np.ma.masked_values([1, 2, 3, 4], 5, shrink=True) + assert_(res.mask is np.ma.nomask) + + res = np.ma.masked_values([1, 2, 3, 4], 5, shrink=False) + assert_equal(res.mask, [False] * 4) + def test_masked_array(): a = np.ma.array([0, 1, 2, 3], mask=[0, 0, 1, 0]) diff --git a/numpy/matrixlib/__init__.py b/numpy/matrixlib/__init__.py index 3ad3a9549..777e0cd33 100644 --- a/numpy/matrixlib/__init__.py +++ b/numpy/matrixlib/__init__.py @@ -7,6 +7,6 @@ from .defmatrix import * __all__ = defmatrix.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/matrixlib/tests/test_defmatrix.py b/numpy/matrixlib/tests/test_defmatrix.py index e74e83cdb..272cd8d52 100644 --- a/numpy/matrixlib/tests/test_defmatrix.py +++ b/numpy/matrixlib/tests/test_defmatrix.py @@ -466,3 +466,11 @@ class TestShape(object): def test_matrix_memory_sharing(self): assert_(np.may_share_memory(self.m, self.m.ravel())) assert_(not np.may_share_memory(self.m, self.m.flatten())) + + def test_expand_dims_matrix(self): + # matrices are always 2d - so expand_dims only makes sense when the + # type is changed away from matrix. + a = np.arange(10).reshape((2, 5)).view(np.matrix) + expanded = np.expand_dims(a, axis=1) + assert_equal(expanded.ndim, 3) + assert_(not isinstance(expanded, np.matrix)) diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py index c18bebedb..85cee9ce6 100644 --- a/numpy/polynomial/__init__.py +++ b/numpy/polynomial/__init__.py @@ -22,6 +22,6 @@ from .hermite import Hermite from .hermite_e import HermiteE from .laguerre import Laguerre -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py index 8add0acbc..310c711ef 100644 --- a/numpy/polynomial/chebyshev.py +++ b/numpy/polynomial/chebyshev.py @@ -83,7 +83,7 @@ References ---------- .. [1] A. T. Benjamin, et al., "Combinatorial Trigonometry with Chebyshev Polynomials," *Journal of Statistical Planning and Inference 14*, 2008 - (preprint: http://www.math.hmc.edu/~benjamin/papers/CombTrig.pdf, pg. 4) + (preprint: https://www.math.hmc.edu/~benjamin/papers/CombTrig.pdf, pg. 4) """ from __future__ import division, absolute_import, print_function @@ -365,7 +365,7 @@ def poly2cheb(pol): >>> c = p.convert(kind=P.Chebyshev) >>> c Chebyshev([ 1. , 3.25, 1. , 0.75], domain=[-1, 1], window=[-1, 1]) - >>> P.poly2cheb(range(4)) + >>> P.chebyshev.poly2cheb(range(4)) array([ 1. , 3.25, 1. 
, 0.75]) """ @@ -417,7 +417,7 @@ def cheb2poly(c): >>> p = c.convert(kind=P.Polynomial) >>> p Polynomial([ -2., -8., 4., 12.], [-1., 1.]) - >>> P.cheb2poly(range(4)) + >>> P.chebyshev.cheb2poly(range(4)) array([ -2., -8., 4., 12.]) """ @@ -1708,7 +1708,7 @@ def chebfit(x, y, deg, rcond=None, full=False, w=None): References ---------- .. [1] Wikipedia, "Curve fitting", - http://en.wikipedia.org/wiki/Curve_fitting + https://en.wikipedia.org/wiki/Curve_fitting Examples -------- diff --git a/numpy/polynomial/hermite.py b/numpy/polynomial/hermite.py index 58e9e180f..75c7e6832 100644 --- a/numpy/polynomial/hermite.py +++ b/numpy/polynomial/hermite.py @@ -1476,7 +1476,7 @@ def hermfit(x, y, deg, rcond=None, full=False, w=None): References ---------- .. [1] Wikipedia, "Curve fitting", - http://en.wikipedia.org/wiki/Curve_fitting + https://en.wikipedia.org/wiki/Curve_fitting Examples -------- diff --git a/numpy/polynomial/hermite_e.py b/numpy/polynomial/hermite_e.py index 47b2a9fb4..125364a11 100644 --- a/numpy/polynomial/hermite_e.py +++ b/numpy/polynomial/hermite_e.py @@ -1473,7 +1473,7 @@ def hermefit(x, y, deg, rcond=None, full=False, w=None): References ---------- .. [1] Wikipedia, "Curve fitting", - http://en.wikipedia.org/wiki/Curve_fitting + https://en.wikipedia.org/wiki/Curve_fitting Examples -------- diff --git a/numpy/polynomial/laguerre.py b/numpy/polynomial/laguerre.py index 5a9a5111a..2b9757ab8 100644 --- a/numpy/polynomial/laguerre.py +++ b/numpy/polynomial/laguerre.py @@ -1475,7 +1475,7 @@ def lagfit(x, y, deg, rcond=None, full=False, w=None): References ---------- .. [1] Wikipedia, "Curve fitting", - http://en.wikipedia.org/wiki/Curve_fitting + https://en.wikipedia.org/wiki/Curve_fitting Examples -------- diff --git a/numpy/polynomial/legendre.py b/numpy/polynomial/legendre.py index 0d4a49afc..a83c5735f 100644 --- a/numpy/polynomial/legendre.py +++ b/numpy/polynomial/legendre.py @@ -1509,7 +1509,7 @@ def legfit(x, y, deg, rcond=None, full=False, w=None): References ---------- .. [1] Wikipedia, "Curve fitting", - http://en.wikipedia.org/wiki/Curve_fitting + https://en.wikipedia.org/wiki/Curve_fitting Examples -------- diff --git a/numpy/random/__init__.py b/numpy/random/__init__.py index 81cb94cc1..82aefce5f 100644 --- a/numpy/random/__init__.py +++ b/numpy/random/__init__.py @@ -117,6 +117,6 @@ def __RandomState_ctor(): """ return RandomState(seed=0) -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx index b45b3146f..ec759fdfb 100644 --- a/numpy/random/mtrand/mtrand.pyx +++ b/numpy/random/mtrand/mtrand.pyx @@ -22,8 +22,8 @@ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. include "Python.pxi" -include "randint_helpers.pxi" include "numpy.pxd" +include "randint_helpers.pxi" include "cpython/pycapsule.pxd" from libc cimport string @@ -573,21 +573,21 @@ def _shape_from_size(size, d): shape = tuple(size) + (d,) return shape -# Look up table for randint functions keyed by type name. The stored data -# is a tuple (lbnd, ubnd, func), where lbnd is the smallest value for the -# type, ubnd is one greater than the largest value, and func is the +# Look up table for randint functions keyed by dtype. +# The stored data is a tuple (lbnd, ubnd, func), where lbnd is the smallest +# value for the type, ubnd is one greater than the largest value, and func is the # function to call. 
_randint_type = { - 'bool': (0, 2, _rand_bool), - 'int8': (-2**7, 2**7, _rand_int8), - 'int16': (-2**15, 2**15, _rand_int16), - 'int32': (-2**31, 2**31, _rand_int32), - 'int64': (-2**63, 2**63, _rand_int64), - 'uint8': (0, 2**8, _rand_uint8), - 'uint16': (0, 2**16, _rand_uint16), - 'uint32': (0, 2**32, _rand_uint32), - 'uint64': (0, 2**64, _rand_uint64) - } + np.dtype(np.bool_): (0, 2, _rand_bool), + np.dtype(np.int8): (-2**7, 2**7, _rand_int8), + np.dtype(np.int16): (-2**15, 2**15, _rand_int16), + np.dtype(np.int32): (-2**31, 2**31, _rand_int32), + np.dtype(np.int64): (-2**63, 2**63, _rand_int64), + np.dtype(np.uint8): (0, 2**8, _rand_uint8), + np.dtype(np.uint16): (0, 2**16, _rand_uint16), + np.dtype(np.uint32): (0, 2**32, _rand_uint32), + np.dtype(np.uint64): (0, 2**64, _rand_uint64) +} cdef class RandomState: @@ -969,13 +969,12 @@ cdef class RandomState: high = low low = 0 - # '_randint_type' is defined in - # 'generate_randint_helpers.py' - key = np.dtype(dtype).name - if key not in _randint_type: - raise TypeError('Unsupported dtype "%s" for randint' % key) - - lowbnd, highbnd, randfunc = _randint_type[key] + raw_dtype = dtype + dtype = np.dtype(dtype) + try: + lowbnd, highbnd, randfunc = _randint_type[dtype] + except KeyError: + raise TypeError('Unsupported dtype "%s" for randint' % dtype) # TODO: Do not cast these inputs to Python int # @@ -986,20 +985,20 @@ cdef class RandomState: ihigh = int(high) if ilow < lowbnd: - raise ValueError("low is out of bounds for %s" % (key,)) + raise ValueError("low is out of bounds for %s" % dtype) if ihigh > highbnd: - raise ValueError("high is out of bounds for %s" % (key,)) - if ilow >= ihigh: - raise ValueError("low >= high") - + raise ValueError("high is out of bounds for %s" % dtype) + if ilow >= ihigh and np.prod(size) != 0: + raise ValueError("Range cannot be empty (low >= high) unless no samples are taken") + with self.lock: ret = randfunc(ilow, ihigh - 1, size, self.state_address) - if size is None: - if dtype in (np.bool, np.int, np.long): - return dtype(ret) + # back-compat: keep python scalars when a python type is passed + if size is None and raw_dtype in (bool, int, np.long): + return raw_dtype(ret) - return ret + return ret def bytes(self, npy_intp length): """ @@ -1115,15 +1114,15 @@ cdef class RandomState: # __index__ must return an integer by python rules. pop_size = operator.index(a.item()) except TypeError: - raise ValueError("a must be 1-dimensional or an integer") - if pop_size <= 0: - raise ValueError("a must be greater than 0") + raise ValueError("'a' must be 1-dimensional or an integer") + if pop_size <= 0 and np.prod(size) != 0: + raise ValueError("'a' must be greater than 0 unless no samples are taken") elif a.ndim != 1: - raise ValueError("a must be 1-dimensional") + raise ValueError("'a' must be 1-dimensional") else: pop_size = a.shape[0] - if pop_size is 0: - raise ValueError("a must be non-empty") + if pop_size is 0 and np.prod(size) != 0: + raise ValueError("'a' cannot be empty unless no samples are taken") if p is not None: d = len(p) @@ -1137,9 +1136,9 @@ cdef class RandomState: pix = <double*>PyArray_DATA(p) if p.ndim != 1: - raise ValueError("p must be 1-dimensional") + raise ValueError("'p' must be 1-dimensional") if p.size != pop_size: - raise ValueError("a and p must have same size") + raise ValueError("'a' and 'p' must have same size") if np.logical_or.reduce(p < 0): raise ValueError("probabilities are not non-negative") if abs(kahan_sum(pix, d) - 1.) 
> atol: @@ -1607,7 +1606,7 @@ cdef class RandomState: References ---------- .. [1] Wikipedia, "Normal distribution", - http://en.wikipedia.org/wiki/Normal_distribution + https://en.wikipedia.org/wiki/Normal_distribution .. [2] P. R. Peebles Jr., "Central Limit Theorem" in "Probability, Random Variables and Random Signal Principles", 4th ed., 2001, pp. 51, 51, 125. @@ -1759,9 +1758,9 @@ cdef class RandomState: .. [1] Peyton Z. Peebles Jr., "Probability, Random Variables and Random Signal Principles", 4th ed, 2001, p. 57. .. [2] Wikipedia, "Poisson process", - http://en.wikipedia.org/wiki/Poisson_process + https://en.wikipedia.org/wiki/Poisson_process .. [3] Wikipedia, "Exponential distribution", - http://en.wikipedia.org/wiki/Exponential_distribution + https://en.wikipedia.org/wiki/Exponential_distribution """ cdef ndarray oscale @@ -1860,7 +1859,7 @@ cdef class RandomState: Wolfram Web Resource. http://mathworld.wolfram.com/GammaDistribution.html .. [2] Wikipedia, "Gamma distribution", - http://en.wikipedia.org/wiki/Gamma_distribution + https://en.wikipedia.org/wiki/Gamma_distribution Examples -------- @@ -1950,7 +1949,7 @@ cdef class RandomState: Wolfram Web Resource. http://mathworld.wolfram.com/GammaDistribution.html .. [2] Wikipedia, "Gamma distribution", - http://en.wikipedia.org/wiki/Gamma_distribution + https://en.wikipedia.org/wiki/Gamma_distribution Examples -------- @@ -2047,7 +2046,7 @@ cdef class RandomState: .. [1] Glantz, Stanton A. "Primer of Biostatistics.", McGraw-Hill, Fifth Edition, 2002. .. [2] Wikipedia, "F-distribution", - http://en.wikipedia.org/wiki/F-distribution + https://en.wikipedia.org/wiki/F-distribution Examples -------- @@ -2150,7 +2149,7 @@ cdef class RandomState: From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/NoncentralF-Distribution.html .. [2] Wikipedia, "Noncentral F-distribution", - http://en.wikipedia.org/wiki/Noncentral_F-distribution + https://en.wikipedia.org/wiki/Noncentral_F-distribution Examples -------- @@ -2257,7 +2256,7 @@ cdef class RandomState: References ---------- .. [1] NIST "Engineering Statistics Handbook" - http://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm + https://www.itl.nist.gov/div898/handbook/eda/section3/eda3666.htm Examples -------- @@ -2333,8 +2332,8 @@ cdef class RandomState: .. [1] Delhi, M.S. Holla, "On a noncentral chi-square distribution in the analysis of weapon systems effectiveness", Metrika, Volume 15, Number 1 / December, 1970. - .. [2] Wikipedia, "Noncentral chi-square distribution" - http://en.wikipedia.org/wiki/Noncentral_chi-square_distribution + .. [2] Wikipedia, "Noncentral chi-squared distribution" + https://en.wikipedia.org/wiki/Noncentral_chi-squared_distribution Examples -------- @@ -2433,12 +2432,12 @@ cdef class RandomState: ---------- .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "Cauchy Distribution", - http://www.itl.nist.gov/div898/handbook/eda/section3/eda3663.htm + https://www.itl.nist.gov/div898/handbook/eda/section3/eda3663.htm .. [2] Weisstein, Eric W. "Cauchy Distribution." From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/CauchyDistribution.html .. [3] Wikipedia, "Cauchy distribution" - http://en.wikipedia.org/wiki/Cauchy_distribution + https://en.wikipedia.org/wiki/Cauchy_distribution Examples -------- @@ -2501,7 +2500,7 @@ cdef class RandomState: .. [1] Dalgaard, Peter, "Introductory Statistics With R", Springer, 2002. .. 
[2] Wikipedia, "Student's t-distribution" - http://en.wikipedia.org/wiki/Student's_t-distribution + https://en.wikipedia.org/wiki/Student's_t-distribution Examples -------- @@ -2731,7 +2730,7 @@ cdef class RandomState: .. [3] Reiss, R.D., Thomas, M.(2001), Statistical Analysis of Extreme Values, Birkhauser Verlag, Basel, pp 23-30. .. [4] Wikipedia, "Pareto distribution", - http://en.wikipedia.org/wiki/Pareto_distribution + https://en.wikipedia.org/wiki/Pareto_distribution Examples -------- @@ -2836,7 +2835,7 @@ cdef class RandomState: Wide Applicability", Journal Of Applied Mechanics ASME Paper 1951. .. [3] Wikipedia, "Weibull distribution", - http://en.wikipedia.org/wiki/Weibull_distribution + https://en.wikipedia.org/wiki/Weibull_distribution Examples -------- @@ -2927,7 +2926,7 @@ cdef class RandomState: Dataplot Reference Manual, Volume 2: Let Subcommands and Library Functions", National Institute of Standards and Technology Handbook Series, June 2003. - http://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/powpdf.pdf + https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/powpdf.pdf Examples -------- @@ -3042,7 +3041,7 @@ cdef class RandomState: From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/LaplaceDistribution.html .. [4] Wikipedia, "Laplace distribution", - http://en.wikipedia.org/wiki/Laplace_distribution + https://en.wikipedia.org/wiki/Laplace_distribution Examples -------- @@ -3272,7 +3271,7 @@ cdef class RandomState: MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/LogisticDistribution.html .. [3] Wikipedia, "Logistic-distribution", - http://en.wikipedia.org/wiki/Logistic_distribution + https://en.wikipedia.org/wiki/Logistic_distribution Examples -------- @@ -3366,7 +3365,7 @@ cdef class RandomState: .. [1] Limpert, E., Stahel, W. A., and Abbt, M., "Log-normal Distributions across the Sciences: Keys and Clues," BioScience, Vol. 51, No. 5, May, 2001. - http://stat.ethz.ch/~stahel/lognormal/bioscience.pdf + https://stat.ethz.ch/~stahel/lognormal/bioscience.pdf .. [2] Reiss, R.D. and Thomas, M., "Statistical Analysis of Extreme Values," Basel: Birkhauser Verlag, 2001, pp. 31-32. @@ -3472,9 +3471,9 @@ cdef class RandomState: References ---------- .. [1] Brighton Webs Ltd., "Rayleigh Distribution," - http://www.brighton-webs.co.uk/distributions/rayleigh.asp + https://web.archive.org/web/20090514091424/http://brighton-webs.co.uk:80/distributions/rayleigh.asp .. [2] Wikipedia, "Rayleigh distribution" - http://en.wikipedia.org/wiki/Rayleigh_distribution + https://en.wikipedia.org/wiki/Rayleigh_distribution Examples -------- @@ -3560,12 +3559,12 @@ cdef class RandomState: References ---------- .. [1] Brighton Webs Ltd., Wald Distribution, - http://www.brighton-webs.co.uk/distributions/wald.asp + https://web.archive.org/web/20090423014010/http://www.brighton-webs.co.uk:80/distributions/wald.asp .. [2] Chhikara, Raj S., and Folks, J. Leroy, "The Inverse Gaussian Distribution: Theory : Methodology, and Applications", CRC Press, 1988. - .. [3] Wikipedia, "Wald distribution" - http://en.wikipedia.org/wiki/Wald_distribution + .. [3] Wikipedia, "Inverse Gaussian distribution" + https://en.wikipedia.org/wiki/Inverse_Gaussian_distribution Examples -------- @@ -3651,7 +3650,7 @@ cdef class RandomState: References ---------- .. 
[1] Wikipedia, "Triangular distribution" - http://en.wikipedia.org/wiki/Triangular_distribution + https://en.wikipedia.org/wiki/Triangular_distribution Examples -------- @@ -3758,7 +3757,7 @@ cdef class RandomState: Wolfram Web Resource. http://mathworld.wolfram.com/BinomialDistribution.html .. [5] Wikipedia, "Binomial distribution", - http://en.wikipedia.org/wiki/Binomial_distribution + https://en.wikipedia.org/wiki/Binomial_distribution Examples -------- @@ -3861,7 +3860,7 @@ cdef class RandomState: MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/NegativeBinomialDistribution.html .. [2] Wikipedia, "Negative binomial distribution", - http://en.wikipedia.org/wiki/Negative_binomial_distribution + https://en.wikipedia.org/wiki/Negative_binomial_distribution Examples -------- @@ -3955,7 +3954,7 @@ cdef class RandomState: From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/PoissonDistribution.html .. [2] Wikipedia, "Poisson distribution", - http://en.wikipedia.org/wiki/Poisson_distribution + https://en.wikipedia.org/wiki/Poisson_distribution Examples -------- @@ -4225,7 +4224,7 @@ cdef class RandomState: MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/HypergeometricDistribution.html .. [3] Wikipedia, "Hypergeometric distribution", - http://en.wikipedia.org/wiki/Hypergeometric_distribution + https://en.wikipedia.org/wiki/Hypergeometric_distribution Examples -------- @@ -4335,7 +4334,7 @@ cdef class RandomState: .. [3] D. J. Hand, F. Daly, D. Lunn, E. Ostrowski, A Handbook of Small Data Sets, CRC Press, 1994. .. [4] Wikipedia, "Logarithmic distribution", - http://en.wikipedia.org/wiki/Logarithmic_distribution + https://en.wikipedia.org/wiki/Logarithmic_distribution Examples -------- @@ -4697,9 +4696,9 @@ cdef class RandomState: ---------- .. [1] David McKay, "Information Theory, Inference and Learning Algorithms," chapter 23, - http://www.inference.phy.cam.ac.uk/mackay/ + http://www.inference.org.uk/mackay/itila/ .. [2] Wikipedia, "Dirichlet distribution", - http://en.wikipedia.org/wiki/Dirichlet_distribution + https://en.wikipedia.org/wiki/Dirichlet_distribution Examples -------- diff --git a/numpy/random/mtrand/randint_helpers.pxi.in b/numpy/random/mtrand/randint_helpers.pxi.in index 4bd7cd356..894a25167 100644 --- a/numpy/random/mtrand/randint_helpers.pxi.in +++ b/numpy/random/mtrand/randint_helpers.pxi.in @@ -23,7 +23,7 @@ def get_dispatch(dtypes): {{for npy_dt, npy_udt, np_dt in get_dispatch(dtypes)}} -def _rand_{{npy_dt}}(low, high, size, rngstate): +def _rand_{{npy_dt}}(npy_{{npy_dt}} low, npy_{{npy_dt}} high, size, rngstate): """ _rand_{{npy_dt}}(low, high, size, rngstate) @@ -60,8 +60,8 @@ def _rand_{{npy_dt}}(low, high, size, rngstate): cdef npy_intp cnt cdef rk_state *state = <rk_state *>PyCapsule_GetPointer(rngstate, NULL) - rng = <npy_{{npy_udt}}>(high - low) - off = <npy_{{npy_udt}}>(<npy_{{npy_dt}}>low) + off = <npy_{{npy_udt}}>(low) + rng = <npy_{{npy_udt}}>(high) - <npy_{{npy_udt}}>(low) if size is None: rk_random_{{npy_udt}}(off, rng, 1, &buf, state) diff --git a/numpy/random/mtrand/randomkit.c b/numpy/random/mtrand/randomkit.c index 380917180..6371ebe33 100644 --- a/numpy/random/mtrand/randomkit.c +++ b/numpy/random/mtrand/randomkit.c @@ -616,7 +616,7 @@ rk_gauss(rk_state *state) } while (r2 >= 1.0 || r2 == 0.0); - /* Box-Muller transform */ + /* Polar method, a more efficient version of the Box-Muller approach. 
*/ f = sqrt(-2.0*log(r2)/r2); /* Keep for next call */ state->gauss = f*x1; diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py index 61c6e912d..2e0885024 100644 --- a/numpy/random/tests/test_random.py +++ b/numpy/random/tests/test_random.py @@ -440,6 +440,15 @@ class TestRandomDist(object): assert_equal(np.random.choice(6, s, replace=False, p=p).shape, s) assert_equal(np.random.choice(np.arange(6), s, replace=True).shape, s) + # Check zero-size + assert_equal(np.random.randint(0, 0, size=(3, 0, 4)).shape, (3, 0, 4)) + assert_equal(np.random.randint(0, -10, size=0).shape, (0,)) + assert_equal(np.random.randint(10, 10, size=0).shape, (0,)) + assert_equal(np.random.choice(0, size=0).shape, (0,)) + assert_equal(np.random.choice([], size=(0,)).shape, (0,)) + assert_equal(np.random.choice(['a', 'b'], size=(3, 0, 4)).shape, (3, 0, 4)) + assert_raises(ValueError, np.random.choice, [], 10) + def test_bytes(self): np.random.seed(self.seed) actual = np.random.bytes(10) @@ -759,7 +768,7 @@ class TestRandomDist(object): [1.40840323350391515e+02, 1.98390255135251704e+05]]) # For some reason on 32-bit x86 Ubuntu 12.10 the [1, 0] entry in this # matrix differs by 24 nulps. Discussion: - # http://mail.python.org/pipermail/numpy-discussion/2012-September/063801.html + # https://mail.python.org/pipermail/numpy-discussion/2012-September/063801.html # Consensus is that this is probably some gcc quirk that affects # rounding but not in any important way, so we just use a looser # tolerance on this test: diff --git a/numpy/testing/__init__.py b/numpy/testing/__init__.py index a7c85931c..a8bd4fc15 100644 --- a/numpy/testing/__init__.py +++ b/numpy/testing/__init__.py @@ -17,6 +17,6 @@ from ._private.nosetester import ( __all__ = _private.utils.__all__ + ['TestCase', 'run_module_suite'] -from ._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/testing/_private/noseclasses.py b/numpy/testing/_private/noseclasses.py index 08dec0ca9..e99bbc97d 100644 --- a/numpy/testing/_private/noseclasses.py +++ b/numpy/testing/_private/noseclasses.py @@ -26,7 +26,7 @@ from .utils import KnownFailureException, KnownFailureTest #----------------------------------------------------------------------------- # Modified version of the one in the stdlib, that fixes a python bug (doctests -# not found in extension modules, http://bugs.python.org/issue3158) +# not found in extension modules, https://bugs.python.org/issue3158) class NumpyDocTestFinder(doctest.DocTestFinder): def _from_module(self, module, object): diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py index 032c4a116..0e2f8ba91 100644 --- a/numpy/testing/_private/utils.py +++ b/numpy/testing/_private/utils.py @@ -69,7 +69,7 @@ def import_nose(): if not nose_is_good: msg = ('Need nose >= %d.%d.%d for tests - see ' - 'http://nose.readthedocs.io' % + 'https://nose.readthedocs.io' % minimum_nose_version) raise ImportError(msg) @@ -177,7 +177,7 @@ if os.name == 'nt': # thread's CPU usage is either 0 or 100). To read counters like this, # you should copy this function, but keep the counter open, and call # CollectQueryData() each time you need to know. 
- # See http://msdn.microsoft.com/library/en-us/dnperfmo/html/perfmonpt2.asp + # See http://msdn.microsoft.com/library/en-us/dnperfmo/html/perfmonpt2.asp (dead link) # My older explanation for this was that the "AddCounter" process forced # the CPU to 100%, but the above makes more sense :) import win32pdh @@ -1075,7 +1075,7 @@ def assert_string_equal(actual, desired): raise AssertionError(repr(type(actual))) if not isinstance(desired, str): raise AssertionError(repr(type(desired))) - if re.match(r'\A'+desired+r'\Z', actual, re.M): + if desired == actual: return diff = list(difflib.Differ().compare(actual.splitlines(1), desired.splitlines(1))) @@ -1099,7 +1099,7 @@ l.append(d3) else: diff.insert(0, d3) - if re.match(r'\A'+d2[2:]+r'\Z', d1[2:]): + if d2[2:] == d1[2:]: continue diff_list.extend(l) continue @@ -1609,7 +1609,7 @@ def _integer_repr(x, vdt, comp): # Reinterpret binary representation of the float as sign-magnitude: # take into account two-complement representation # See also - # http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm + # https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ rx = x.view(vdt) if not (rx.size == 1): rx[rx < 0] = comp - rx[rx < 0] @@ -1917,7 +1917,7 @@ class suppress_warnings(object): ``warnings.catch_warnings``. However, it also provides a filter mechanism to work around - http://bugs.python.org/issue4180. + https://bugs.python.org/issue4180. This bug causes Python before 3.4 to not reliably show warnings again after they have been ignored once (even within catch_warnings). It diff --git a/numpy/testing/setup.py b/numpy/testing/setup.py index e27a9b85b..7c3f2fbdf 100755 --- a/numpy/testing/setup.py +++ b/numpy/testing/setup.py @@ -15,7 +15,7 @@ if __name__ == '__main__': setup(maintainer="NumPy Developers", maintainer_email="numpy-dev@numpy.org", description="NumPy test module", - url="http://www.numpy.org", + url="https://www.numpy.org", license="NumPy License (BSD Style)", configuration=configuration, ) diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py index 465c217d4..84d310992 100644 --- a/numpy/testing/tests/test_utils.py +++ b/numpy/testing/tests/test_utils.py @@ -1081,6 +1081,12 @@ class TestStringEqual(object): assert_raises(AssertionError, lambda: assert_string_equal("foo", "hello")) + + def test_regex(self): + assert_string_equal("a+*b", "a+*b") + + assert_raises(AssertionError, + lambda: assert_string_equal("aaa", "a+b")) def assert_warn_len_equal(mod, n_in_context, py34=None, py37=None):
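The ``assert_string_equal`` hunk above replaces regex matching with plain equality: ``desired`` used to be interpolated into a pattern, so regex metacharacters could make unequal strings compare equal (or raise ``re.error``), which is what the new ``test_regex`` exercises. A minimal sketch (not part of the diff) of the old failure mode:

    import re

    desired, actual = "a+b", "aaab"
    # old check: `desired` is treated as a pattern, so this spuriously matches
    assert re.match(r'\A' + desired + r'\Z', actual, re.M)
    # new check: plain comparison correctly reports a mismatch
    assert desired != actual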