52 files changed, 1198 insertions, 520 deletions
diff --git a/doc/neps/index.rst.tmpl b/doc/neps/index.rst.tmpl index 6c988014f..6cbad8eb2 100644 --- a/doc/neps/index.rst.tmpl +++ b/doc/neps/index.rst.tmpl @@ -15,7 +15,7 @@ Meta-NEPs (NEPs about NEPs or Processes) :maxdepth: 1 {% for nep, tags in neps.items() if tags['Type'] == 'Process' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} nep-template @@ -27,7 +27,7 @@ Accepted NEPs, implementation in progress :maxdepth: 1 {% for nep, tags in neps.items() if tags['Status'] == 'Accepted' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} @@ -38,7 +38,7 @@ Open NEPs (under consideration) :maxdepth: 1 {% for nep, tags in neps.items() if tags['Status'] == 'Draft' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} @@ -50,7 +50,7 @@ Implemented NEPs :maxdepth: 1 {% for nep, tags in neps.items() if tags['Status'] == 'Final' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} Deferred NEPs @@ -60,7 +60,7 @@ Deferred NEPs :maxdepth: 1 {% for nep, tags in neps.items() if tags['Status'] == 'Deferred' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} Rejected NEPs @@ -70,5 +70,5 @@ Rejected NEPs :maxdepth: 1 {% for nep, tags in neps.items() if tags['Status'] == 'Rejected' %} - NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> + {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} diff --git a/doc/neps/nep-0000.rst b/doc/neps/nep-0000.rst index b451eeff7..a3ec3a42b 100644 --- a/doc/neps/nep-0000.rst +++ b/doc/neps/nep-0000.rst @@ -1,6 +1,6 @@ -=================== -Purpose and Process -=================== +=========================== +NEP 0 — Purpose and Process +=========================== :Author: Jarrod Millman <millman@berkeley.edu> :Status: Active diff --git a/doc/neps/nep-0001-npy-format.rst b/doc/neps/nep-0001-npy-format.rst index 74512128d..4eded02ff 100644 --- a/doc/neps/nep-0001-npy-format.rst +++ b/doc/neps/nep-0001-npy-format.rst @@ -1,6 +1,6 @@ -===================================== -A Simple File Format for NumPy Arrays -===================================== +============================================= +NEP 1 — A Simple File Format for NumPy Arrays +============================================= :Author: Robert Kern <robert.kern@gmail.com> :Status: Final diff --git a/doc/neps/nep-0002-warnfix.rst b/doc/neps/nep-0002-warnfix.rst index 60dc885b2..207dfa3d4 100644 --- a/doc/neps/nep-0002-warnfix.rst +++ b/doc/neps/nep-0002-warnfix.rst @@ -1,6 +1,6 @@ -========================================================================= -A proposal to build numpy without warning with a big set of warning flags -========================================================================= +================================================================================= +NEP 2 — A proposal to build numpy without warning with a big set of warning flags +================================================================================= :Author: David Cournapeau :Contact: david@ar.media.kyoto-u.ac.jp diff --git a/doc/neps/nep-0003-math_config_clean.rst b/doc/neps/nep-0003-math_config_clean.rst index 5af907437..ebd32b124 100644 --- a/doc/neps/nep-0003-math_config_clean.rst +++ b/doc/neps/nep-0003-math_config_clean.rst @@ -1,6 
+1,6 @@ -=========================================================== -Cleaning the math configuration of numpy.core -=========================================================== +===================================================== +NEP 3 — Cleaning the math configuration of numpy.core +===================================================== :Author: David Cournapeau :Contact: david@ar.media.kyoto-u.ac.jp diff --git a/doc/neps/nep-0004-datetime-proposal3.rst b/doc/neps/nep-0004-datetime-proposal3.rst index afeb00d73..b32964e88 100644 --- a/doc/neps/nep-0004-datetime-proposal3.rst +++ b/doc/neps/nep-0004-datetime-proposal3.rst @@ -1,6 +1,6 @@ -==================================================================== - A (third) proposal for implementing some date/time types in NumPy -==================================================================== +========================================================================= +NEP 4 — A (third) proposal for implementing some date/time types in NumPy +========================================================================= :Author: Francesc Alted i Abad :Contact: faltet@pytables.com diff --git a/doc/neps/nep-0005-generalized-ufuncs.rst b/doc/neps/nep-0005-generalized-ufuncs.rst index 54b2b370e..366e26ffd 100644 --- a/doc/neps/nep-0005-generalized-ufuncs.rst +++ b/doc/neps/nep-0005-generalized-ufuncs.rst @@ -1,6 +1,6 @@ -=============================== -Generalized Universal Functions -=============================== +======================================= +NEP 5 — Generalized Universal Functions +======================================= :Status: Final diff --git a/doc/neps/nep-0006-newbugtracker.rst b/doc/neps/nep-0006-newbugtracker.rst index 2b9344ed0..8dc7a1d8e 100644 --- a/doc/neps/nep-0006-newbugtracker.rst +++ b/doc/neps/nep-0006-newbugtracker.rst @@ -1,6 +1,6 @@ -=========================================== -Replacing Trac with a different bug tracker -=========================================== +=================================================== +NEP 6 — Replacing Trac with a different bug tracker +=================================================== :Author: David Cournapeau, Stefan van der Walt :Status: Deferred diff --git a/doc/neps/nep-0007-datetime-proposal.rst b/doc/neps/nep-0007-datetime-proposal.rst index 90894da49..5547a4306 100644 --- a/doc/neps/nep-0007-datetime-proposal.rst +++ b/doc/neps/nep-0007-datetime-proposal.rst @@ -1,6 +1,6 @@ -==================================================================== - A proposal for implementing some date/time types in NumPy -==================================================================== +================================================================== +NEP 7 — A proposal for implementing some date/time types in NumPy +================================================================== :Author: Travis Oliphant :Contact: oliphant@enthought.com diff --git a/doc/neps/nep-0008-groupby_additions.rst b/doc/neps/nep-0008-groupby_additions.rst index fa02f2f9c..3189fcf41 100644 --- a/doc/neps/nep-0008-groupby_additions.rst +++ b/doc/neps/nep-0008-groupby_additions.rst @@ -1,6 +1,6 @@ -==================================================================== - A proposal for adding groupby functionality to NumPy -==================================================================== +============================================================= +NEP 8 — A proposal for adding groupby functionality to NumPy +============================================================= :Author: Travis Oliphant :Contact: 
oliphant@enthought.com diff --git a/doc/neps/nep-0009-structured_array_extensions.rst b/doc/neps/nep-0009-structured_array_extensions.rst index 695d0d516..8b81a308d 100644 --- a/doc/neps/nep-0009-structured_array_extensions.rst +++ b/doc/neps/nep-0009-structured_array_extensions.rst @@ -1,6 +1,6 @@ -=========================== -Structured array extensions -=========================== +=================================== +NEP 9 — Structured array extensions +=================================== :Status: Deferred diff --git a/doc/neps/nep-0010-new-iterator-ufunc.rst b/doc/neps/nep-0010-new-iterator-ufunc.rst index 7b388a974..8601b4a4c 100644 --- a/doc/neps/nep-0010-new-iterator-ufunc.rst +++ b/doc/neps/nep-0010-new-iterator-ufunc.rst @@ -1,6 +1,6 @@ -===================================== -Optimizing Iterator/UFunc Performance -===================================== +============================================== +NEP 10 — Optimizing Iterator/UFunc Performance +============================================== :Author: Mark Wiebe <mwwiebe@gmail.com> :Content-Type: text/x-rst diff --git a/doc/neps/nep-0011-deferred-ufunc-evaluation.rst b/doc/neps/nep-0011-deferred-ufunc-evaluation.rst index 5f5de3518..a7143c6ee 100644 --- a/doc/neps/nep-0011-deferred-ufunc-evaluation.rst +++ b/doc/neps/nep-0011-deferred-ufunc-evaluation.rst @@ -1,6 +1,6 @@ -========================= -Deferred UFunc Evaluation -========================= +================================== +NEP 11 — Deferred UFunc Evaluation +================================== :Author: Mark Wiebe <mwwiebe@gmail.com> :Content-Type: text/x-rst diff --git a/doc/neps/nep-0012-missing-data.rst b/doc/neps/nep-0012-missing-data.rst index 57c45b4b6..dbcf1b579 100644 --- a/doc/neps/nep-0012-missing-data.rst +++ b/doc/neps/nep-0012-missing-data.rst @@ -1,6 +1,6 @@ -=================================== -Missing Data Functionality in NumPy -=================================== +============================================ +NEP 12 — Missing Data Functionality in NumPy +============================================ :Author: Mark Wiebe <mwwiebe@gmail.com> :Copyright: Copyright 2011 by Enthought, Inc diff --git a/doc/neps/nep-0013-ufunc-overrides.rst b/doc/neps/nep-0013-ufunc-overrides.rst index 61e2ceea9..a51ce3927 100644 --- a/doc/neps/nep-0013-ufunc-overrides.rst +++ b/doc/neps/nep-0013-ufunc-overrides.rst @@ -1,6 +1,6 @@ -================================= -A Mechanism for Overriding Ufuncs -================================= +========================================== +NEP 13 — A Mechanism for Overriding Ufuncs +========================================== .. 
currentmodule:: numpy diff --git a/doc/neps/nep-0014-dropping-python2.7-proposal.rst b/doc/neps/nep-0014-dropping-python2.7-proposal.rst index 158b89e1c..3adf3b407 100644 --- a/doc/neps/nep-0014-dropping-python2.7-proposal.rst +++ b/doc/neps/nep-0014-dropping-python2.7-proposal.rst @@ -1,6 +1,6 @@ -==================================== -Plan for dropping Python 2.7 support -==================================== +============================================= +NEP 14 — Plan for dropping Python 2.7 support +============================================= :Status: Accepted :Resolution: https://mail.python.org/pipermail/numpy-discussion/2017-November/077419.html diff --git a/doc/neps/nep-0015-merge-multiarray-umath.rst b/doc/neps/nep-0015-merge-multiarray-umath.rst index 17852220f..5e605a04f 100644 --- a/doc/neps/nep-0015-merge-multiarray-umath.rst +++ b/doc/neps/nep-0015-merge-multiarray-umath.rst @@ -1,6 +1,6 @@ -============================ -Merging multiarray and umath -============================ +===================================== +NEP 15 — Merging multiarray and umath +===================================== :Author: Nathaniel J. Smith <njs@pobox.com> :Status: Draft diff --git a/doc/neps/nep-0017-split-out-maskedarray.rst b/doc/neps/nep-0017-split-out-maskedarray.rst index d6dcc1def..7ef949763 100644 --- a/doc/neps/nep-0017-split-out-maskedarray.rst +++ b/doc/neps/nep-0017-split-out-maskedarray.rst @@ -1,6 +1,6 @@ -======================= -Split Out Masked Arrays -======================= +================================ +NEP 17 — Split Out Masked Arrays +================================ :Author: Stéfan van der Walt <stefanv@berkeley.edu> :Status: Rejected diff --git a/doc/neps/nep-0018-array-function-protocol.rst b/doc/neps/nep-0018-array-function-protocol.rst index 58a2833e6..3e23a2e28 100644 --- a/doc/neps/nep-0018-array-function-protocol.rst +++ b/doc/neps/nep-0018-array-function-protocol.rst @@ -1,6 +1,6 @@ -=========================================================== -A dispatch mechanism for NumPy's high level array functions -=========================================================== +==================================================================== +NEP 18 — A dispatch mechanism for NumPy's high level array functions +==================================================================== :Author: Stephan Hoyer <shoyer@google.com> :Author: Matthew Rocklin <mrocklin@gmail.com> diff --git a/doc/neps/nep-0019-rng-policy.rst b/doc/neps/nep-0019-rng-policy.rst index a2cc80262..fe389e5d5 100644 --- a/doc/neps/nep-0019-rng-policy.rst +++ b/doc/neps/nep-0019-rng-policy.rst @@ -1,6 +1,6 @@ -============================== -Random Number Generator Policy -============================== +======================================= +NEP 19 — Random Number Generator Policy +======================================= :Author: Robert Kern <robert.kern@gmail.com> :Status: Draft @@ -169,14 +169,16 @@ context of small unit tests. The new PRNG subsystem MUST provide a second, legacy distributions class that uses the same implementations of the distribution methods as the current -version of ``numpy.random.RandomState``. The methods of this class will keep -the same strict stream-compatibility guarantees. It is intended that this -class will no longer be modified, except to keep it working when numpy -internals change. All new development should go into the primary distributions -class. 
The purpose of ``RandomState`` will be documented as providing certain -fixed functionality for backwards compatibility and stable numbers for the -limited purpose of unit testing, and not making whole programs reproducible -across numpy versions. +version of ``numpy.random.RandomState``. The methods of this class will have +strict stream-compatibility guarantees, even stricter than the current policy. +It is intended that this class will no longer be modified, except to keep it +working when numpy internals change. All new development should go into the +primary distributions class. Bug fixes that change the stream SHALL NOT be +made to ``RandomState``; instead, buggy distributions should be made to warn +when they are buggy. The purpose of ``RandomState`` will be documented as +providing certain fixed functionality for backwards compatibility and stable +numbers for the limited purpose of unit testing, and not making whole programs +reproducible across numpy versions. This legacy distributions class MUST be accessible under the name ``numpy.random.RandomState`` for backwards compatibility. All current ways of diff --git a/doc/neps/nep-0020-gufunc-signature-enhancement.rst b/doc/neps/nep-0020-gufunc-signature-enhancement.rst index 903ee60cb..be7eecbf3 100644 --- a/doc/neps/nep-0020-gufunc-signature-enhancement.rst +++ b/doc/neps/nep-0020-gufunc-signature-enhancement.rst @@ -1,6 +1,6 @@ -====================================================== -Expansion of Generalized Universal Function Signatures -====================================================== +=============================================================== +NEP 20 — Expansion of Generalized Universal Function Signatures +=============================================================== :Author: Marten van Kerkwijk <mhvk@astro.utoronto.ca> :Status: Draft diff --git a/doc/neps/nep-0021-advanced-indexing.rst b/doc/neps/nep-0021-advanced-indexing.rst index 0279146be..d883a5589 100644 --- a/doc/neps/nep-0021-advanced-indexing.rst +++ b/doc/neps/nep-0021-advanced-indexing.rst @@ -1,6 +1,6 @@ -========================================= -Simplified and explicit advanced indexing -========================================= +================================================== +NEP 21 — Simplified and explicit advanced indexing +================================================== :Author: Sebastian Berg :Author: Stephan Hoyer <shoyer@google.com> diff --git a/doc/neps/nep-0022-ndarray-duck-typing-overview.rst b/doc/neps/nep-0022-ndarray-duck-typing-overview.rst new file mode 100644 index 000000000..04e4a14b7 --- /dev/null +++ b/doc/neps/nep-0022-ndarray-duck-typing-overview.rst @@ -0,0 +1,351 @@ +=========================================================== +NEP 22 — Duck typing for NumPy arrays – high level overview +=========================================================== + +:Author: Stephan Hoyer <shoyer@google.com>, Nathaniel J. Smith <njs@pobox.com> +:Status: Draft +:Type: Informational +:Created: 2018-03-22 + +Abstract +-------- + +We outline a high-level vision for how NumPy will approach handling +“duck arrays”. This is an Informational-class NEP; it doesn’t +prescribe full details for any particular implementation. In brief, we +propose developing a number of new protocols for defining +implementations of multi-dimensional arrays with high-level APIs +matching NumPy. 
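To make the "duck array" idea concrete before the detailed discussion, here is a minimal sketch of the kind of object this NEP has in mind. The class is purely illustrative (every name in it is invented, and nothing here is proposed API): it stores an N-by-N array as just its diagonal, yet "quacks like" an ndarray by exposing the same Python-level attributes::

    import numpy as np

    class DiagonalArray:
        # Hypothetical duck array: compact storage, ndarray-like API.
        def __init__(self, diagonal):
            self._diag = np.asarray(diagonal)

        @property
        def shape(self):
            return (len(self._diag), len(self._diag))

        @property
        def ndim(self):
            return 2

        @property
        def dtype(self):
            return self._diag.dtype

        def __array__(self):
            # Materialize a real ndarray only on demand.
            return np.diag(self._diag)

    a = DiagonalArray([1.0, 2.0, 3.0])
    print(a.shape, a.dtype)     # (3, 3) float64
    print(np.asarray(a).sum())  # 6.0, converted via __array__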
+ + Detailed description -------------------- Traditionally, NumPy’s ``ndarray`` objects have provided two things: a high level API for expressing operations on homogeneously-typed, arbitrary-dimensional, array-structured data, and a concrete implementation of the API based on strided in-RAM storage. The API is powerful, fairly general, and used ubiquitously across the scientific Python stack. The concrete implementation, on the other hand, is suitable for a wide range of uses, but has limitations: as data sets grow and NumPy becomes used in a variety of new environments, there are increasingly cases where the strided in-RAM storage strategy is inappropriate, and users find they need sparse arrays, lazily evaluated arrays (as in dask), compressed arrays (as in blosc), arrays stored in GPU memory, arrays stored in alternative formats such as Arrow, and so forth – yet users still want to work with these arrays using the familiar NumPy APIs, and re-use existing code with minimal (ideally zero) porting overhead. As a working shorthand, we call these “duck arrays”, by analogy with Python’s “duck typing”: a “duck array” is a Python object which “quacks like” a numpy array in the sense that it has the same or similar Python API, but doesn’t share the C-level implementation. + +This NEP doesn’t propose any specific changes to NumPy or other projects; instead, it gives an overview of how we hope to extend NumPy to support a robust ecosystem of projects implementing and relying upon its high level API. + +Terminology +~~~~~~~~~~~ + +“Duck array” works fine as a placeholder for now, but it’s pretty jargony and may confuse new users, so we may want to pick something else for the actual API functions. Unfortunately, “array-like” is already taken for the concept of “anything that can be coerced into an array” (including e.g. list objects), and “anyarray” is already taken for the concept of “something that shares ndarray’s implementation, but has different semantics”, which is the opposite of a duck array (e.g., np.matrix is an “anyarray”, but is not a “duck array”). This is a classic bike-shed, so for now we’re just using “duck array”. Some possible options though include: arrayish, pseudoarray, nominalarray, ersatzarray, arraymimic, ... + + +General approach +~~~~~~~~~~~~~~~~ + +At a high level, duck array support requires working through each of the API functions provided by NumPy, and figuring out how it can be extended to work with duck array objects. In some cases this is easy (e.g., methods/attributes on ndarray itself); in other cases it’s more difficult. Here are some principles we’ve found useful so far: + + +Principle 1: Focus on “full” duck arrays, but don’t rule out “partial” duck arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We can distinguish between two classes: + +* “full” duck arrays, which aspire to fully implement np.ndarray’s + Python-level APIs and work essentially anywhere that np.ndarray + works. + +* “partial” duck arrays, which intentionally implement only a subset + of np.ndarray’s API. + +Full duck arrays are, well, kind of boring. They have exactly the same semantics as ndarray, with differences being restricted to under-the-hood decisions about how the data is actually stored. The kind of people who are excited about making numpy more extensible are also, unsurprisingly, excited about changing or extending numpy’s semantics.
So there’s been a lot of discussion of how to best support partial duck arrays. We've been guilty of this ourselves. + +At this point though, we think the best general strategy is to focus our efforts primarily on supporting full duck arrays, and only worry about partial duck arrays as much as we need to in order to make sure we don't accidentally rule them out for no reason. + +Why focus on full duck arrays? Several reasons: + +First, there are lots of very clear use cases. Potential consumers of the full duck array interface include almost every package that uses numpy (scipy, sklearn, astropy, ...), and in particular packages that provide array-wrapping-classes that handle multiple types of arrays, such as xarray and dask.array. Potential implementers of the full duck array interface include: distributed arrays, sparse arrays, masked arrays, arrays with units (unless they switch to using dtypes), labeled arrays, and so forth. Clear use cases lead to good and relevant APIs. + +Second, the Anna Karenina principle applies here: full duck arrays are all alike, but every partial duck array is partial in its own way: + +* ``xarray.DataArray`` is mostly a duck array, but has incompatible + broadcasting semantics. +* ``xarray.Dataset`` wraps multiple arrays in one object; it still + implements some array interfaces like ``__array_ufunc__``, but + certainly not all of them. +* ``pandas.Series`` has methods with similar behavior to numpy, but + unique null-skipping behavior. +* scipy’s ``LinearOperator``\s support matrix multiplication and nothing else. +* h5py and similar libraries for accessing array storage have objects + that support numpy-like slicing and conversion into a full array, + but not computation. +* Some classes may be similar to ndarray, but without supporting the + full indexing semantics. + +And so forth. + +Despite our best attempts, we haven't found any clear, unique way of slicing up the ndarray API into a hierarchy of related types that captures these distinctions; in fact, it’s unlikely that any single person even understands all the distinctions. And this is important, because we have a *lot* of APIs that we need to add duck array support to (both in numpy and in all the projects that depend on numpy!). By definition, these already work for ``ndarray``, so hopefully getting them to work for full duck arrays shouldn’t be so hard, since by definition full duck arrays act like ``ndarray``. It’d be very cumbersome to have to go through each function and identify the exact subset of the ndarray API that it needs, then figure out which partial array types can/should support it. Once we have things working for full duck arrays, we can go back later and refine the APIs further as needed. Focusing on full duck arrays allows us to start making progress immediately. + +In the future, it might be useful to identify specific use cases for duck arrays and standardize narrower interfaces targeted just at those use cases. For example, it might make sense to have a standard “array loader” interface that file access libraries like h5py, netcdf, pydap, zarr, ... all implement, to make it easy to switch between these libraries. But that’s something that we can do as we go, and it doesn’t necessarily have to involve the NumPy devs at all. For an example of what this might look like, see the documentation for `dask.array.from_array <http://dask.pydata.org/en/latest/array-api.html#dask.array.from_array>`__.
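The "array loader" idea in the last paragraph is easy to sketch. Below is a hypothetical h5py-style object (all names invented for illustration): it supports shape/dtype inspection, slicing, and conversion into a real array, but no computation. Objects of roughly this shape are already what ``dask.array.from_array`` knows how to wrap::

    import numpy as np

    class LazyFileArray:
        # Hypothetical "array loader": inspection and slicing, no math.
        def __init__(self, path, shape, dtype):
            self._path = path              # where the data would live
            self.shape = shape
            self.dtype = np.dtype(dtype)

        def __getitem__(self, index):
            # A real implementation would read only the requested
            # region from disk; zeros stand in for file I/O here.
            return np.zeros(self.shape, self.dtype)[index]

        def __array__(self):
            # Whole-array conversion, used by np.asarray().
            return self[...]

    loader = LazyFileArray('data.h5', (4, 4), 'float64')
    chunk = loader[:2, :2]      # partial read, returns a 2x2 ndarray
    full = np.asarray(loader)   # full conversion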
+ + +Principle 2: Take advantage of duck typing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``ndarray`` has a very large API surface area:: + + In [1]: len(set(dir(np.ndarray)) - set(dir(object))) + Out[1]: 138 + +And this is a huge **under**\estimate, because there are also many +free-standing functions in NumPy and other libraries which currently +use the NumPy C API and thus only work on ``ndarray`` objects. In type +theory, a type is defined by the operations you can perform on an +object; thus, the actual type of ``ndarray`` includes not just its +methods and attributes, but *all* of these functions. For duck arrays +to be successful, they’ll need to implement a large proportion of the +``ndarray`` API – but not all of it. (For example, +``dask.array.Array`` does not provide an equivalent to the +``ndarray.ptp`` method, presumably because no-one has ever noticed or +cared about its absence. But this doesn’t seem to have stopped people +from using dask.) + +This means that realistically, we can’t hope to define the whole duck +array API up front, or that anyone will be able to implement it all in +one go; this will be an incremental process. It also means that even +the so-called “full” duck array interface is somewhat fuzzily defined +at the borders; there are parts of the ``np.ndarray`` API that duck +arrays won’t have to implement, but we aren’t entirely sure what those +are. + +And ultimately, it isn’t really up to the NumPy developers to define +what does or doesn’t qualify as a duck array. If we want scikit-learn +functions to work on dask arrays (for example), then that’s going to +require negotiation between those two projects to discover +incompatibilities, and when an incompatibility is discovered it will +be up to them to negotiate who should change and how. The NumPy +project can provide technical tools and general advice to help resolve +these disagreements, but we can’t force one group or another to take +responsibility for any given bug. + +Therefore, even though we’re focusing on “full” duck arrays, we +*don’t* attempt to define a normative “array ABC” – maybe this will be +useful someday, but right now, it’s not. And as a convenient +side-effect, the lack of a normative definition leaves partial duck +arrays room to experiment. + +But, we do provide some more detailed advice for duck array +implementers and consumers below. + +Principle 3: Focus on protocols +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Historically, numpy has had lots of success at interoperating with +third-party objects by defining *protocols*, like ``__array__`` (asks +an arbitrary object to convert itself into an array), +``__array_interface__`` (a precursor to Python’s buffer protocol), and +``__array_ufunc__`` (allows third-party objects to support ufuncs like +``np.exp``). + +`NEP 16 <https://github.com/numpy/numpy/pull/10706>`_ took a +different approach: we need a duck-array equivalent of +``asarray``, and it proposed to do this by defining a version of +``asarray`` that would let through objects which implemented a new +AbstractArray ABC. As noted above, we now think that trying to define +an ABC is a bad idea for other reasons. But when this NEP was +discussed on the mailing list, we realized that even on its own +merits, this idea is not so great. A better approach is to define a +*method* that can be called on an arbitrary object to ask it to +convert itself into a duck array, and then define a version of +``asarray`` that calls this method. 
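As a rough sketch of this approach, with placeholder names throughout (the actual protocol and its spelling are deferred to the separate NEP referenced below)::

    import numpy as np

    def asduckarray(obj, dtype=None):
        # Hypothetical duck-array counterpart of np.asarray().
        duck = getattr(obj, '__duckarray__', None)
        if duck is not None:
            # Ask the object to convert itself; an object that is
            # already a duck array can simply return itself.
            return duck(dtype=dtype)
        # Fall back to coercing to a plain ndarray.
        return np.asarray(obj, dtype=dtype)

    class SparseThing:
        # Stands in for some third-party duck array class.
        def __duckarray__(self, dtype=None):
            return self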
+ +This is strictly more powerful: if an object is already a duck array, it can simply ``return self``. It allows more correct semantics: NEP 16 assumed that ``asarray(obj, dtype=X)`` is the same as ``asarray(obj).astype(X)``, but this isn’t true. And it supports more use cases: if h5py supported sparse arrays, it might want to provide an object which is not itself a sparse array, but which can be automatically converted into a sparse array. See NEP <XX, to be written> for full details. + +The protocol approach is also more consistent with core Python conventions: for example, see the ``__iter__`` method for coercing objects to iterators, or the ``__index__`` protocol for safe integer coercion. And finally, focusing on protocols leaves the door open for partial duck arrays, which can pick and choose which subset of the protocols they want to participate in, each of which has well-defined semantics. + +Conclusion: protocols are one honking great idea – let’s do more of those. + +Principle 4: Reuse existing methods when possible +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It’s tempting to try to define cleaned-up versions of ndarray methods with a more minimal interface to allow for easier implementation. For example, ``__array_reshape__`` could drop some of the strange arguments accepted by ``reshape`` and ``__array_basic_getitem__`` could drop all the `strange edge cases <http://www.numpy.org/neps/nep-0021-advanced-indexing.html>`__ of NumPy’s advanced indexing. + +But as discussed above, we don’t really know what APIs we need for duck-typing ndarray. We would inevitably end up with a very long list of new special methods. In contrast, existing methods like ``reshape`` and ``__getitem__`` have the advantage of already being widely used/exercised by libraries that use duck arrays, and in practice, any serious duck array type is going to have to implement them anyway. + +Principle 5: Make it easy to do the right thing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Making duck arrays work well is going to be a community effort. Documentation helps, but only goes so far. We want to make it easy to implement duck arrays that do the right thing. + +One way NumPy can help is by providing mixin classes for implementing large groups of related functionality at once. ``NDArrayOperatorsMixin`` is a good example: it allows for implementing arithmetic operators implicitly via the ``__array_ufunc__`` method. It’s not complete, and we’ll want more helpers like that (e.g. for reductions). + +(We initially thought that the importance of these mixins might be an argument for providing an array ABC, since that’s the standard way to do mixins in modern Python. But in discussion around NEP 16 we realized that partial duck arrays also wanted to take advantage of these mixins in some cases, so even if we did have an array ABC then the mixins would still need some sort of separate existence. So never mind that argument.) + +Tentative duck array guidelines +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As a general rule, libraries using duck arrays should insist upon the minimum possible requirements, and libraries implementing duck arrays should provide as complete an API as possible. This will ensure maximum compatibility. For example, users should prefer to rely on ``.transpose()`` rather than ``.swapaxes()`` (which can be implemented in terms of transpose), but duck array authors should ideally implement both.
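The mixin approach from Principle 5 already works today. Here is a minimal sketch of a duck array that implements ``__array_ufunc__`` once and inherits every arithmetic operator from ``numpy.lib.mixins.NDArrayOperatorsMixin``. It is simplified: a serious implementation would also handle the ``out`` argument and return ``NotImplemented`` for unrecognized input types::

    import numpy as np
    from numpy.lib.mixins import NDArrayOperatorsMixin

    class WrappedArray(NDArrayOperatorsMixin):
        def __init__(self, value):
            self.value = np.asarray(value)

        def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
            # Unwrap any WrappedArray inputs, compute, re-wrap the result.
            inputs = tuple(x.value if isinstance(x, WrappedArray) else x
                           for x in inputs)
            return type(self)(getattr(ufunc, method)(*inputs, **kwargs))

        def __repr__(self):
            return 'WrappedArray(%r)' % self.value

    w = WrappedArray([1.0, 2.0])
    print(w + 1)      # operators come from the mixin
    print(np.exp(w))  # ufuncs route through __array_ufunc__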
+ +If you are trying to implement a duck array, then you should strive to implement everything. You certainly need ``.shape``, ``.ndim`` and ``.dtype``, but also your dtype attribute should actually be a ``numpy.dtype`` object, weird fancy indexing edge cases should ideally work, etc. Only details related to NumPy’s specific ``np.ndarray`` implementation (e.g., ``strides``, ``data``, ``view``) are explicitly out of scope. + +A (very) rough sketch of future plans +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The proposals discussed so far – ``__array_ufunc__`` and some kind of ``asarray`` protocol – are clearly necessary but not sufficient for full duck typing support. We expect the need for additional protocols to support (at least) these features: + +* **Concatenating** duck arrays, which would be used internally by other + array combining methods like stack/vstack/hstack. The implementation + of concatenate will need to be negotiated among the list of array + arguments. We expect to use an ``__array_concatenate__`` protocol + like ``__array_ufunc__`` instead of multiple dispatch. +* **Ufunc-like functions** that currently aren’t ufuncs. Many NumPy + functions like median, percentile, sort, where and clip could be + written as generalized ufuncs but currently aren’t. Either these + functions should be written as ufuncs, or we should consider adding + another generic wrapper mechanism that works similarly to ufuncs but + makes fewer guarantees about how the implementation is done. +* **Random number generation** with duck arrays, e.g., + ``np.random.randn()``. For example, we might want to add new APIs + like ``random_like()`` for generating new arrays with a matching + shape *and* type – though we'll need to look at some real examples + of how these functions are used to figure out what would be helpful. +* **Miscellaneous other functions** such as ``np.einsum``, + ``np.zeros_like``, and ``np.broadcast_to`` that don’t fall into any + of the above categories. +* **Checking mutability** on duck arrays, which would imply that they + support assignment with ``__setitem__`` and the out argument to + ufuncs. Many otherwise fine duck arrays are not easily mutable (for + example, because they use some kinds of sparse or compressed + storage, or are in read-only shared memory), and it turns out that + frequently-used code like the default implementation of ``np.mean`` + needs to check this (to decide whether it can re-use temporary + arrays). + +We intentionally do not describe exactly how to add support for these types of duck arrays here. These will be the subject of future NEPs. + + +Copyright +--------- + +This document has been placed in the public domain. diff --git a/doc/neps/tools/build_index.py b/doc/neps/tools/build_index.py index 65225c995..d9c4f690b 100644 --- a/doc/neps/tools/build_index.py +++ b/doc/neps/tools/build_index.py @@ -40,6 +40,10 @@ def nep_metadata(): tags['Title'] = lines[1].strip() tags['Filename'] = source + if not tags['Title'].startswith(f'NEP {nr} — '): + raise RuntimeError( + f'Title for NEP {nr} does not start with "NEP {nr} — " ' + '(note that — here is a special, elongated dash)') if tags['Status'] in ('Accepted', 'Rejected', 'Withdrawn'): if not 'Resolution' in tags: diff --git a/doc/release/1.16.0-notes.rst b/doc/release/1.16.0-notes.rst index 8df763b56..3daa4ae97 100644 --- a/doc/release/1.16.0-notes.rst +++ b/doc/release/1.16.0-notes.rst @@ -41,6 +41,12 @@ Even when no elements needed to be drawn, ``np.random.randint`` and distribution.
This has been fixed so that e.g. ``np.random.choice([], 0) == np.array([], dtype=float64)``. +ARM support updated +------------------- +Support for ARM CPUs has been updated to accommodate 32 and 64 bit targets, +and also big and little endian byte ordering. AARCH32 memory alignment issues +have been addressed. + Changes ======= diff --git a/numpy/__init__.py b/numpy/__init__.py index 77b1d924d..b912d2222 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -139,9 +139,7 @@ else: loader = PackageLoader(infunc=True) return loader(*packages, **options) - from . import add_newdocs - __all__ = ['add_newdocs', - 'ModuleDeprecationWarning', + __all__ = ['ModuleDeprecationWarning', 'VisibleDeprecationWarning'] pkgload.__doc__ = PackageLoader.__call__.__doc__ @@ -191,7 +189,7 @@ else: from .testing import Tester # Pytest testing - from numpy.testing._private.pytesttester import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester @@ -214,7 +212,9 @@ else: except AssertionError: msg = ("The current Numpy installation ({!r}) fails to " "pass simple sanity checks. This can be caused for example " - "by incorrect BLAS library being linked in.") + "by incorrect BLAS library being linked in, or by mixing " + "package managers (pip, conda, apt, ...). Search closed " + "numpy issues for similar problems.") raise RuntimeError(msg.format(__file__)) _sanity_check() diff --git a/numpy/testing/_private/pytesttester.py b/numpy/_pytesttester.py index 8c73fafa4..6a1b3274e 100644 --- a/numpy/testing/_private/pytesttester.py +++ b/numpy/_pytesttester.py @@ -5,7 +5,7 @@ This module implements the ``test()`` function for NumPy modules. The usual boiler plate for doing that is to put the following in the module ``__init__.py`` file:: - from numpy.testing import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__).test del PytestTester @@ -23,6 +23,9 @@ whether or not that file is found as follows: In practice, tests run from the numpy repo are run in develop mode. That includes the standard ``python runtests.py`` invocation. +This module is imported by every numpy subpackage, so lies at the top level to +simplify circular import issues. For the same reason, it contains no numpy +imports at module scope, instead importing numpy within function calls. """ from __future__ import division, absolute_import, print_function diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py index 4d9cbf5da..9ef30b018 100644 --- a/numpy/core/__init__.py +++ b/numpy/core/__init__.py @@ -59,6 +59,10 @@ del nt from .fromnumeric import amax as max, amin as min, round_ as round from .numeric import absolute as abs +# do this after everything else, to minimize the chance of this misleadingly +# appearing in an import-time traceback +from . import _add_newdocs + __all__ = ['char', 'rec', 'memmap'] __all__ += numeric.__all__ __all__ += fromnumeric.__all__ @@ -100,6 +104,6 @@ del copyreg del sys del _ufunc_reduce -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/add_newdocs.py b/numpy/core/_add_newdocs.py index a882bf1e0..f596e613f 100644 --- a/numpy/add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -10,7 +10,7 @@ NOTE: Many of the methods of ndarray have corresponding functions. 
""" from __future__ import division, absolute_import, print_function -from numpy.lib import add_newdoc +from numpy.core.function_base import add_newdoc ############################################################################### # diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py index 2fac3caf3..163f125c2 100644 --- a/numpy/core/einsumfunc.py +++ b/numpy/core/einsumfunc.py @@ -4,6 +4,8 @@ Implementation of optimized einsum. """ from __future__ import division, absolute_import, print_function +import itertools + from numpy.compat import basestring from numpy.core.multiarray import c_einsum from numpy.core.numeric import asarray, asanyarray, result_type, tensordot, dot @@ -14,6 +16,44 @@ einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' einsum_symbols_set = set(einsum_symbols) +def _flop_count(idx_contraction, inner, num_terms, size_dictionary): + """ + Computes the number of FLOPS in the contraction. + + Parameters + ---------- + idx_contraction : iterable + The indices involved in the contraction + inner : bool + Does this contraction require an inner product? + num_terms : int + The number of terms in a contraction + size_dictionary : dict + The size of each of the indices in idx_contraction + + Returns + ------- + flop_count : int + The total number of FLOPS required for the contraction. + + Examples + -------- + + >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5}) + 90 + + >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5}) + 270 + + """ + + overall_size = _compute_size_by_dict(idx_contraction, size_dictionary) + op_factor = max(1, num_terms - 1) + if inner: + op_factor += 1 + + return overall_size * op_factor + def _compute_size_by_dict(indices, idx_dict): """ Computes the product of the elements in indices based on the dictionary @@ -139,14 +179,9 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): iter_results = [] # Compute all unique pairs - comb_iter = [] - for x in range(len(input_sets) - iteration): - for y in range(x + 1, len(input_sets) - iteration): - comb_iter.append((x, y)) - for curr in full_results: cost, positions, remaining = curr - for con in comb_iter: + for con in itertools.combinations(range(len(input_sets) - iteration), 2): # Find the contraction cont = _find_contraction(con, remaining, output_set) @@ -157,15 +192,10 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): if new_size > memory_limit: continue - # Find cost - new_cost = _compute_size_by_dict(idx_contract, idx_dict) - if idx_removed: - new_cost *= 2 - # Build (total_cost, positions, indices_remaining) - new_cost += cost + total_cost = cost + _flop_count(idx_contract, idx_removed, len(con), idx_dict) new_pos = positions + [con] - iter_results.append((new_cost, new_pos, new_input_sets)) + iter_results.append((total_cost, new_pos, new_input_sets)) # Update combinatorial list, if we did not find anything return best # path + remaining contractions @@ -183,6 +213,102 @@ def _optimal_path(input_sets, output_set, idx_dict, memory_limit): path = min(full_results, key=lambda x: x[0])[1] return path +def _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost, naive_cost): + """Compute the cost (removed size + flops) and resultant indices for + performing the contraction specified by ``positions``. + + Parameters + ---------- + positions : tuple of int + The locations of the proposed tensors to contract. + input_sets : list of sets + The indices found on each tensors. 
+ output_set : set + The output indices of the expression. + idx_dict : dict + Mapping of each index to its size. + memory_limit : int + The total allowed size for an intermediary tensor. + path_cost : int + The contraction cost so far. + naive_cost : int + The cost of the unoptimized expression. + + Returns + ------- + cost : (int, int) + A tuple containing the size of any indices removed, and the flop cost. + positions : tuple of int + The locations of the proposed tensors to contract. + new_input_sets : list of sets + The resulting new list of indices if this proposed contraction is performed. + + """ + + # Find the contraction + contract = _find_contraction(positions, input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + + # Sieve the results based on memory_limit + new_size = _compute_size_by_dict(idx_result, idx_dict) + if new_size > memory_limit: + return None + + # Build sort tuple + old_sizes = (_compute_size_by_dict(input_sets[p], idx_dict) for p in positions) + removed_size = sum(old_sizes) - new_size + + # NB: removed_size used to be just the size of any removed indices i.e.: + # helpers.compute_size_by_dict(idx_removed, idx_dict) + cost = _flop_count(idx_contract, idx_removed, len(positions), idx_dict) + sort = (-removed_size, cost) + + # Sieve based on total cost as well + if (path_cost + cost) > naive_cost: + return None + + # Add contraction to possible choices + return [sort, positions, new_input_sets] + + +def _update_other_results(results, best): + """Update the positions and provisional input_sets of ``results`` based on + performing the contraction result ``best``. Remove any results involving the + tensors just contracted. + + Parameters + ---------- + results : list + List of contraction results produced by ``_parse_possible_contraction``. + best : list + The best contraction of ``results``, i.e. the one that will be performed. + + Returns + ------- + mod_results : list + The list of modified results, updated with the outcome of the ``best`` contraction.
+ """ + + best_con = best[1] + bx, by = best_con + mod_results = [] + + for cost, (x, y), con_sets in results: + + # Ignore results involving tensors just contracted + if x in best_con or y in best_con: + continue + + # Update the input_sets + del con_sets[by - int(by > x) - int(by > y)] + del con_sets[bx - int(bx > x) - int(bx > y)] + con_sets.insert(-1, best[2][-1]) + + # Update the position indices + mod_con = x - int(x > bx) - int(x > by), y - int(y > bx) - int(y > by) + mod_results.append((cost, mod_con, con_sets)) + + return mod_results def _greedy_path(input_sets, output_set, idx_dict, memory_limit): """ @@ -219,46 +345,68 @@ def _greedy_path(input_sets, output_set, idx_dict, memory_limit): [(0, 2), (0, 1)] """ + # Handle trivial cases that leaked through if len(input_sets) == 1: return [(0,)] + elif len(input_sets) == 2: + return [(0, 1)] + + # Build up a naive cost + contract = _find_contraction(range(len(input_sets)), input_sets, output_set) + idx_result, new_input_sets, idx_removed, idx_contract = contract + naive_cost = _flop_count(idx_contract, idx_removed, len(input_sets), idx_dict) + # Initially iterate over all pairs + comb_iter = itertools.combinations(range(len(input_sets)), 2) + known_contractions = [] + + path_cost = 0 path = [] - for iteration in range(len(input_sets) - 1): - iteration_results = [] - comb_iter = [] - # Compute all unique pairs - for x in range(len(input_sets)): - for y in range(x + 1, len(input_sets)): - comb_iter.append((x, y)) + for iteration in range(len(input_sets) - 1): + # Iterate over all pairs on first step, only previously found pairs on subsequent steps for positions in comb_iter: - # Find the contraction - contract = _find_contraction(positions, input_sets, output_set) - idx_result, new_input_sets, idx_removed, idx_contract = contract - - # Sieve the results based on memory_limit - if _compute_size_by_dict(idx_result, idx_dict) > memory_limit: + # Always initially ignore outer products + if input_sets[positions[0]].isdisjoint(input_sets[positions[1]]): continue - # Build sort tuple - removed_size = _compute_size_by_dict(idx_removed, idx_dict) - cost = _compute_size_by_dict(idx_contract, idx_dict) - sort = (-removed_size, cost) + result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, path_cost, + naive_cost) + if result is not None: + known_contractions.append(result) - # Add contraction to possible choices - iteration_results.append([sort, positions, new_input_sets]) + # If we do not have a inner contraction, rescan pairs including outer products + if len(known_contractions) == 0: - # If we did not find a new contraction contract remaining - if len(iteration_results) == 0: - path.append(tuple(range(len(input_sets)))) - break + # Then check the outer products + for positions in itertools.combinations(range(len(input_sets)), 2): + result = _parse_possible_contraction(positions, input_sets, output_set, idx_dict, memory_limit, + path_cost, naive_cost) + if result is not None: + known_contractions.append(result) + + # If we still did not find any remaining contractions, default back to einsum like behavior + if len(known_contractions) == 0: + path.append(tuple(range(len(input_sets)))) + break # Sort based on first index - best = min(iteration_results, key=lambda x: x[0]) - path.append(best[1]) + best = min(known_contractions, key=lambda x: x[0]) + + # Now propagate as many unused contractions as possible to next iteration + known_contractions = _update_other_results(known_contractions, best) + + # Next 
iteration only compute contractions with the new tensor + # All other contractions have been accounted for input_sets = best[2] + new_tensor_pos = len(input_sets) - 1 + comb_iter = ((i, new_tensor_pos) for i in range(new_tensor_pos)) + + # Update path and total cost + path.append(best[1]) + path_cost += best[0][1] return path @@ -314,26 +462,27 @@ def _can_dot(inputs, result, idx_removed): if len(inputs) != 2: return False - # Build a few temporaries input_left, input_right = inputs + + for c in set(input_left + input_right): + # can't deal with repeated indices on same input or more than 2 total + nl, nr = input_left.count(c), input_right.count(c) + if (nl > 1) or (nr > 1) or (nl + nr > 2): + return False + + # can't do implicit summation or dimension collapse e.g. + # "ab,bc->c" (implicitly sum over 'a') + # "ab,ca->ca" (take diagonal of 'a') + if nl + nr - 1 == int(c in result): + return False + + # Build a few temporaries set_left = set(input_left) set_right = set(input_right) keep_left = set_left - idx_removed keep_right = set_right - idx_removed rs = len(idx_removed) - # Indices must overlap between the two operands - if not len(set_left & set_right): - return False - - # We cannot have duplicate indices ("ijj, jk -> ik") - if (len(set_left) != len(input_left)) or (len(set_right) != len(input_right)): - return False - - # Cannot handle partial inner ("ij, ji -> i") - if len(keep_left & keep_right): - return False - # At this point we are a DOT, GEMV, or GEMM operation # Handle inner products @@ -698,6 +847,7 @@ def einsum_path(*operands, **kwargs): # Get length of each unique dimension and ensure all dimensions are correct dimension_dict = {} + broadcast_indices = [[] for x in range(len(input_list))] for tnum, term in enumerate(input_list): sh = operands[tnum].shape if len(sh) != len(term): @@ -706,6 +856,11 @@ def einsum_path(*operands, **kwargs): % (input_subscripts[tnum], tnum)) for cnum, char in enumerate(term): dim = sh[cnum] + + # Build out broadcast indices + if dim == 1: + broadcast_indices[tnum].append(char) + if char in dimension_dict.keys(): # For broadcasting cases we always want the largest dim size if dimension_dict[char] == 1: @@ -717,6 +872,9 @@ def einsum_path(*operands, **kwargs): else: dimension_dict[char] = dim + # Convert broadcast inds to sets + broadcast_indices = [set(x) for x in broadcast_indices] + # Compute size of each input array plus the output array size_list = [] for term in input_list + [output_subscript]: @@ -730,20 +888,14 @@ def einsum_path(*operands, **kwargs): # Compute naive cost # This isn't quite right, need to look into exactly how einsum does this - naive_cost = _compute_size_by_dict(indices, dimension_dict) - indices_in_input = input_subscripts.replace(',', '') - mult = max(len(input_list) - 1, 1) - if (len(indices_in_input) - len(set(indices_in_input))): - mult *= 2 - naive_cost *= mult + inner_product = (sum(len(x) for x in input_sets) - len(indices)) > 0 + naive_cost = _flop_count(indices, inner_product, len(input_list), dimension_dict) # Compute the path if (path_type is False) or (len(input_list) in [1, 2]) or (indices == output_set): # Nothing to be optimized, leave it to einsum path = [tuple(range(len(input_list)))] elif path_type == "greedy": - # Maximum memory should be at most out_size for this algorithm - memory_arg = min(memory_arg, max_size) path = _greedy_path(input_sets, output_set, dimension_dict, memory_arg) elif path_type == "optimal": path = _optimal_path(input_sets, output_set, dimension_dict, memory_arg) @@ -762,18 
+914,24 @@ def einsum_path(*operands, **kwargs): contract = _find_contraction(contract_inds, input_sets, output_set) out_inds, input_sets, idx_removed, idx_contract = contract - cost = _compute_size_by_dict(idx_contract, dimension_dict) - if idx_removed: - cost *= 2 + cost = _flop_count(idx_contract, idx_removed, len(contract_inds), dimension_dict) cost_list.append(cost) scale_list.append(len(idx_contract)) size_list.append(_compute_size_by_dict(out_inds, dimension_dict)) + bcast = set() tmp_inputs = [] for x in contract_inds: tmp_inputs.append(input_list.pop(x)) + bcast |= broadcast_indices.pop(x) - do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + new_bcast_inds = bcast - idx_removed + + # If we're broadcasting, nix blas + if not len(idx_removed & bcast): + do_blas = _can_dot(tmp_inputs, out_inds, idx_removed) + else: + do_blas = False # Last contraction if (cnum - len(path)) == -1: @@ -783,6 +941,7 @@ def einsum_path(*operands, **kwargs): idx_result = "".join([x[1] for x in sorted(sort_result)]) input_list.append(idx_result) + broadcast_indices.append(new_bcast_inds) einsum_str = ",".join(tmp_inputs) + "->" + idx_result contraction = (contract_inds, idx_removed, einsum_str, input_list[:], do_blas) @@ -1200,25 +1359,14 @@ def einsum(*operands, **kwargs): tmp_operands.append(operands.pop(x)) # Do we need to deal with the output? - if specified_out and ((num + 1) == len(contraction_list)): - handle_out = True + handle_out = specified_out and ((num + 1) == len(contraction_list)) - # Handle broadcasting vs BLAS cases + # Call tensordot if still possible if blas: # Checks have already been handled input_str, results_index = einsum_str.split('->') input_left, input_right = input_str.split(',') - if 1 in tmp_operands[0].shape or 1 in tmp_operands[1].shape: - left_dims = {dim: size for dim, size in - zip(input_left, tmp_operands[0].shape)} - right_dims = {dim: size for dim, size in - zip(input_right, tmp_operands[1].shape)} - # If dims do not match we are broadcasting, BLAS off - if any(left_dims[ind] != right_dims[ind] for ind in idx_rm): - blas = False - # Call tensordot if still possible - if blas: tensor_result = input_left + input_right for s in idx_rm: tensor_result = tensor_result.replace(s, "") diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py index 82de1a36e..fb72bada5 100644 --- a/numpy/core/function_base.py +++ b/numpy/core/function_base.py @@ -6,6 +6,7 @@ import operator from . import numeric as _nx from .numeric import (result_type, NaN, shares_memory, MAY_SHARE_BOUNDS, TooHardError,asanyarray) +from numpy.core.multiarray import add_docstring __all__ = ['logspace', 'linspace', 'geomspace'] @@ -356,3 +357,38 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None): endpoint=endpoint, base=10.0, dtype=dtype) return result.astype(dtype) + + +#always succeed +def add_newdoc(place, obj, doc): + """ + Adds documentation to obj which is in module place. + + If doc is a string add it to obj as a docstring + + If doc is a tuple, then the first element is interpreted as + an attribute of obj and the second as the docstring + (method, docstring) + + If doc is a list, then each element of the list should be a + sequence of length two --> [(method1, docstring1), + (method2, docstring2), ...] + + This routine never raises an error. + + This routine cannot modify read-only docstrings, as appear + in new-style classes or built-in functions. Because this + routine never raises an error the caller must check manually + that the docstrings were changed. 
+ """ + try: + new = getattr(__import__(place, globals(), {}, [obj]), obj) + if isinstance(doc, str): + add_docstring(new, doc.strip()) + elif isinstance(doc, tuple): + add_docstring(getattr(new, doc[0]), doc[1].strip()) + elif isinstance(doc, list): + for val in doc: + add_docstring(getattr(new, val[0]), val[1].strip()) + except Exception: + pass diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index f2c61a0a1..c712fd3ef 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -63,10 +63,27 @@ #define NPY_CPU_HPPA #elif defined(__alpha__) #define NPY_CPU_ALPHA -#elif defined(__arm__) && defined(__ARMEL__) - #define NPY_CPU_ARMEL -#elif defined(__arm__) && defined(__ARMEB__) - #define NPY_CPU_ARMEB +#elif defined(__arm__) + #if defined(__ARMEB__) + #if defined(__ARM_32BIT_STATE) + #define NPY_CPU_ARMEB_AARCH32 + #elif defined(__ARM_64BIT_STATE) + #define NPY_CPU_ARMEB_AARCH64 + #else + #define NPY_CPU_ARMEB + #endif + #elif defined(__ARMEL__) + #if defined(__ARM_32BIT_STATE) + #define NPY_CPU_ARMEL_AARCH32 + #elif defined(__ARM_64BIT_STATE) + #define NPY_CPU_ARMEL_AARCH64 + #else + #define NPY_CPU_ARMEL + #endif + #else + # error Unknown ARM CPU, please report this to numpy maintainers with \ + information about your platform (OS, CPU and compiler) + #endif #elif defined(__sh__) && defined(__LITTLE_ENDIAN__) #define NPY_CPU_SH_LE #elif defined(__sh__) && defined(__BIG_ENDIAN__) @@ -77,8 +94,6 @@ #define NPY_CPU_MIPSEB #elif defined(__or1k__) #define NPY_CPU_OR1K -#elif defined(__aarch64__) - #define NPY_CPU_AARCH64 #elif defined(__mc68000__) #define NPY_CPU_M68K #elif defined(__arc__) && defined(__LITTLE_ENDIAN__) diff --git a/numpy/core/include/numpy/npy_endian.h b/numpy/core/include/numpy/npy_endian.h index 649bdb0a6..44cdffd14 100644 --- a/numpy/core/include/numpy/npy_endian.h +++ b/numpy/core/include/numpy/npy_endian.h @@ -37,28 +37,31 @@ #define NPY_LITTLE_ENDIAN 1234 #define NPY_BIG_ENDIAN 4321 - #if defined(NPY_CPU_X86) \ - || defined(NPY_CPU_AMD64) \ - || defined(NPY_CPU_IA64) \ - || defined(NPY_CPU_ALPHA) \ - || defined(NPY_CPU_ARMEL) \ - || defined(NPY_CPU_AARCH64) \ - || defined(NPY_CPU_SH_LE) \ - || defined(NPY_CPU_MIPSEL) \ - || defined(NPY_CPU_PPC64LE) \ - || defined(NPY_CPU_ARCEL) \ + #if defined(NPY_CPU_X86) \ + || defined(NPY_CPU_AMD64) \ + || defined(NPY_CPU_IA64) \ + || defined(NPY_CPU_ALPHA) \ + || defined(NPY_CPU_ARMEL) \ + || defined(NPY_CPU_ARMEL_AARCH32) \ + || defined(NPY_CPU_ARMEL_AARCH64) \ + || defined(NPY_CPU_SH_LE) \ + || defined(NPY_CPU_MIPSEL) \ + || defined(NPY_CPU_PPC64LE) \ + || defined(NPY_CPU_ARCEL) \ || defined(NPY_CPU_RISCV64) #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN - #elif defined(NPY_CPU_PPC) \ - || defined(NPY_CPU_SPARC) \ - || defined(NPY_CPU_S390) \ - || defined(NPY_CPU_HPPA) \ - || defined(NPY_CPU_PPC64) \ - || defined(NPY_CPU_ARMEB) \ - || defined(NPY_CPU_SH_BE) \ - || defined(NPY_CPU_MIPSEB) \ - || defined(NPY_CPU_OR1K) \ - || defined(NPY_CPU_M68K) \ + #elif defined(NPY_CPU_PPC) \ + || defined(NPY_CPU_SPARC) \ + || defined(NPY_CPU_S390) \ + || defined(NPY_CPU_HPPA) \ + || defined(NPY_CPU_PPC64) \ + || defined(NPY_CPU_ARMEB) \ + || defined(NPY_CPU_ARMEB_AARCH32) \ + || defined(NPY_CPU_ARMEB_AARCH64) \ + || defined(NPY_CPU_SH_BE) \ + || defined(NPY_CPU_MIPSEB) \ + || defined(NPY_CPU_OR1K) \ + || defined(NPY_CPU_M68K) \ || defined(NPY_CPU_ARCEB) #define NPY_BYTE_ORDER NPY_BIG_ENDIAN #else diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c 
index 46ff78b9c..cdca1d606 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -2084,7 +2084,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) PyArray_TRIVIALLY_ITERABLE_OP_READ, PyArray_TRIVIALLY_ITERABLE_OP_READ) || (PyArray_NDIM(tmp_arr) == 0 && - PyArray_TRIVIALLY_ITERABLE(tmp_arr))) && + PyArray_TRIVIALLY_ITERABLE(ind))) && /* Check if the type is equivalent to INTP */ PyArray_ITEMSIZE(ind) == sizeof(npy_intp) && PyArray_DESCR(ind)->kind == 'i' && diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h index 094612b7d..f9c671f77 100644 --- a/numpy/core/src/private/lowlevel_strided_loops.h +++ b/numpy/core/src/private/lowlevel_strided_loops.h @@ -689,21 +689,16 @@ npy_bswap8_unaligned(char * x) #define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0 #define PyArray_TRIVIALLY_ITERABLE_OP_READ 1 -#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \ - PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \ - PyArray_CompareLists(PyArray_DIMS(arr1), \ - PyArray_DIMS(arr2), \ - PyArray_NDIM(arr1)) && \ - (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \ - NPY_ARRAY_F_CONTIGUOUS)) & \ - (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \ - NPY_ARRAY_F_CONTIGUOUS)) \ - ) +#define PyArray_TRIVIALLY_ITERABLE(arr) ( \ + PyArray_NDIM(arr) <= 1 || \ + PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \ + PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \ + ) #define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \ - size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \ - PyArray_STRIDE(arr, 0) : \ - PyArray_ITEMSIZE(arr))) + assert(PyArray_TRIVIALLY_ITERABLE(arr)), \ + size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \ + PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr))) static NPY_INLINE int PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2, @@ -757,15 +752,22 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead); } +#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \ + PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \ + PyArray_CompareLists(PyArray_DIMS(arr1), \ + PyArray_DIMS(arr2), \ + PyArray_NDIM(arr1)) && \ + (PyArray_FLAGS(arr1)&(NPY_ARRAY_C_CONTIGUOUS| \ + NPY_ARRAY_F_CONTIGUOUS)) & \ + (PyArray_FLAGS(arr2)&(NPY_ARRAY_C_CONTIGUOUS| \ + NPY_ARRAY_F_CONTIGUOUS)) \ + ) + #define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \ arr1, arr2, arr1_read, arr2_read)) -#define PyArray_TRIVIALLY_ITERABLE(arr) ( \ - PyArray_NDIM(arr) <= 1 || \ - PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \ - PyArray_CHKFLAGS(arr, NPY_ARRAY_F_CONTIGUOUS) \ - ) + #define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \ count = PyArray_SIZE(arr); \ data = PyArray_BYTES(arr); \ @@ -774,7 +776,6 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr PyArray_STRIDE(arr, 0) : \ PyArray_ITEMSIZE(arr))); - #define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) ( \ PyArray_TRIVIALLY_ITERABLE(arr1) && \ (PyArray_NDIM(arr2) == 0 || \ diff --git a/numpy/core/src/private/npy_config.h b/numpy/core/src/private/npy_config.h index 107b3cb5b..8143e7719 100644 --- a/numpy/core/src/private/npy_config.h +++ b/numpy/core/src/private/npy_config.h @@ -15,7 +15,8 @@ * amd64 is not harmed much by the bloat as the system provides 16 byte * alignment 
by default. */ -#if (defined NPY_CPU_X86 || defined _WIN32) +#if (defined NPY_CPU_X86 || defined _WIN32 || defined NPY_CPU_ARMEL_AARCH32 ||\ + defined NPY_CPU_ARMEB_AARCH32) #define NPY_MAX_COPY_ALIGNMENT 8 #else #define NPY_MAX_COPY_ALIGNMENT 16 #endif diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index b964c568e..a3fd72839 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -552,6 +552,181 @@ ufunc_get_name_cstr(PyUFuncObject *ufunc) { } /* + * Helpers for keyword parsing + */ + +/* + * Find key in a list of pointers to keyword names. + * The list should end with NULL. + * + * Returns either the index into the list (pointing to the final NULL entry + * if no match was found), or -1 on failure. + */ +static npy_intp +locate_key(PyObject **kwnames, PyObject *key) +{ + PyObject **kwname = kwnames; + while (*kwname != NULL && *kwname != key) { + kwname++; + } + /* Slow fallback, just in case */ + if (*kwname == NULL) { + int cmp = 0; + kwname = kwnames; + while (*kwname != NULL && + (cmp = PyObject_RichCompareBool(key, *kwname, + Py_EQ)) == 0) { + kwname++; + } + if (cmp < 0) { + return -1; + } + } + return kwname - kwnames; +} + +/* + * Parse keyword arguments, matching against kwnames. + * + * Arguments beyond kwnames (the va_list) should contain converters and outputs + * for each keyword name (where an output can be NULL to indicate the particular + * keyword should be ignored). + * + * Returns 0 on success, -1 on failure with an error set. + * + * Note that the parser does not clean up on failure, i.e., already parsed keyword + * values may hold new references, which the caller has to remove. + * + * TODO: ufunc is only used for the name in error messages; passing on the + * name instead might be an option. + * + * TODO: instead of having this function ignore keywords for which the + * corresponding output is NULL, the calling routine should prepare the + * correct list. + */ +static int +parse_ufunc_keywords(PyUFuncObject *ufunc, PyObject *kwds, PyObject **kwnames, ...) +{ + va_list va; + PyObject *key, *value; + Py_ssize_t pos = 0; + typedef int converter(PyObject *, void *); + + while (PyDict_Next(kwds, &pos, &key, &value)) { + int i; + converter *convert; + void *output = NULL; + npy_intp index = locate_key(kwnames, key); + if (index < 0) { + return -1; + } + if (kwnames[index]) { + va_start(va, kwnames); + for (i = 0; i <= index; i++) { + convert = va_arg(va, converter *); + output = va_arg(va, void *); + } + va_end(va); + } + if (output) { + if (!convert(value, output)) { + return -1; + } + } + else { +#if PY_VERSION_HEX >= 0x03000000 + PyErr_Format(PyExc_TypeError, + "'%S' is an invalid keyword to ufunc '%s'", + key, ufunc_get_name_cstr(ufunc)); +#else + char *str = PyString_AsString(key); + if (str == NULL) { + PyErr_Clear(); + PyErr_SetString(PyExc_TypeError, "invalid keyword argument"); + } + else { + PyErr_Format(PyExc_TypeError, + "'%s' is an invalid keyword to ufunc '%s'", + str, ufunc_get_name_cstr(ufunc)); + } +#endif + return -1; + } + } + return 0; +} + +/* + * Converters for use in parsing of keyword arguments.
+ */ +NPY_NO_EXPORT int +_subok_converter(PyObject *obj, int *subok) +{ + if (PyBool_Check(obj)) { + *subok = (obj == Py_True); + return NPY_SUCCEED; + } + else { + PyErr_SetString(PyExc_TypeError, + "'subok' must be a boolean"); + return NPY_FAIL; + } +} + +NPY_NO_EXPORT int +_keepdims_converter(PyObject *obj, int *keepdims) +{ + if (PyBool_Check(obj)) { + *keepdims = (obj == Py_True); + return NPY_SUCCEED; + } + else { + PyErr_SetString(PyExc_TypeError, + "'keepdims' must be a boolean"); + return NPY_FAIL; + } +} + +NPY_NO_EXPORT int +_wheremask_converter(PyObject *obj, PyArrayObject **wheremask) +{ + /* + * Optimization: where=True is the same as no where argument. + * This lets us document True as the default. + */ + if (obj == Py_True) { + return NPY_SUCCEED; + } + else { + PyArray_Descr *dtype = PyArray_DescrFromType(NPY_BOOL); + if (dtype == NULL) { + return NPY_FAIL; + } + /* PyArray_FromAny steals reference to dtype, even on failure */ + *wheremask = (PyArrayObject *)PyArray_FromAny(obj, dtype, 0, 0, 0, NULL); + if ((*wheremask) == NULL) { + return NPY_FAIL; + } + return NPY_SUCCEED; + } +} + +NPY_NO_EXPORT int +_new_reference(PyObject *obj, PyObject **out) +{ + Py_INCREF(obj); + *out = obj; + return NPY_SUCCEED; +} + +NPY_NO_EXPORT int +_borrowed_reference(PyObject *obj, PyObject **out) +{ + *out = obj; + return NPY_SUCCEED; +} + +/* * Parses the positional and keyword arguments for a generic ufunc call. * All returned arguments are new references (with optional ones NULL * if not present) @@ -575,12 +750,9 @@ get_ufunc_arguments(PyUFuncObject *ufunc, int nout = ufunc->nout; int nop = ufunc->nargs; PyObject *obj, *context; - PyObject *str_key_obj = NULL; - const char *ufunc_name = ufunc_get_name_cstr(ufunc); - int has_sig = 0; - + PyArray_Descr *dtype = NULL; /* - * Initialize objects so caller knows when outputs and other optional + * Initialize output objects so caller knows when outputs and optional * arguments are set (also means we can safely XDECREF on failure). */ for (i = 0; i < nop; i++) { @@ -646,253 +818,149 @@ get_ufunc_arguments(PyUFuncObject *ufunc, } /* - * Get keyword output and other arguments. - * Raise an error if anything else is present in the - * keyword dictionary. + * If keywords are present, get keyword output and other arguments. + * Raise an error if anything else is present in the keyword dictionary. 
*/ - if (kwds != NULL) { - PyObject *key, *value; - Py_ssize_t pos = 0; - while (PyDict_Next(kwds, &pos, &key, &value)) { - Py_ssize_t length = 0; - char *str = NULL; - int bad_arg = 1; - -#if defined(NPY_PY3K) - Py_XDECREF(str_key_obj); - str_key_obj = PyUnicode_AsASCIIString(key); - if (str_key_obj != NULL) { - key = str_key_obj; - } -#endif - - if (PyBytes_AsStringAndSize(key, &str, &length) < 0) { - PyErr_Clear(); - PyErr_SetString(PyExc_TypeError, "invalid keyword argument"); + if (kwds) { + PyObject *out_kwd = NULL; + PyObject *sig = NULL; + static PyObject *kwnames[13] = {NULL}; + if (kwnames[0] == NULL) { + kwnames[0] = npy_um_str_out; + kwnames[1] = npy_um_str_where; + kwnames[2] = npy_um_str_axes; + kwnames[3] = npy_um_str_axis; + kwnames[4] = npy_um_str_keepdims; + kwnames[5] = npy_um_str_casting; + kwnames[6] = npy_um_str_order; + kwnames[7] = npy_um_str_dtype; + kwnames[8] = npy_um_str_subok; + kwnames[9] = npy_um_str_signature; + kwnames[10] = npy_um_str_sig; + kwnames[11] = npy_um_str_extobj; + kwnames[12] = NULL; /* sentinel */ + } + /* + * Parse using converters to calculate outputs + * (NULL outputs are treated as indicating a keyword is not allowed). + */ + if (parse_ufunc_keywords( + ufunc, kwds, kwnames, + _borrowed_reference, &out_kwd, + _wheremask_converter, out_wheremask, /* new reference */ + _new_reference, out_axes, + _new_reference, out_axis, + _keepdims_converter, out_keepdims, + PyArray_CastingConverter, out_casting, + PyArray_OrderConverter, out_order, + PyArray_DescrConverter2, &dtype, /* new reference */ + _subok_converter, out_subok, + _new_reference, out_typetup, + _borrowed_reference, &sig, + _new_reference, out_extobj) < 0) { + goto fail; + } + /* + * Check that outputs were not passed as positional as well, + * and that they are either None or an array. + */ + if (out_kwd) { /* borrowed reference */ + /* + * Output arrays are generally specified as a tuple of arrays + * and None, but may be a single array or None for ufuncs + * with a single output. 
+ */ + if (nargs > nin) { + PyErr_SetString(PyExc_ValueError, + "cannot specify 'out' as both a " + "positional and keyword argument"); goto fail; } - - switch (str[0]) { - case 'a': - /* possible axes argument for generalized ufunc */ - if (out_axes != NULL && strcmp(str, "axes") == 0) { - if (out_axis != NULL && *out_axis != NULL) { - PyErr_SetString(PyExc_TypeError, - "cannot specify both 'axis' and 'axes'"); - goto fail; - } - Py_INCREF(value); - *out_axes = value; - bad_arg = 0; - } - else if (out_axis != NULL && strcmp(str, "axis") == 0) { - if (out_axes != NULL && *out_axes != NULL) { - PyErr_SetString(PyExc_TypeError, - "cannot specify both 'axis' and 'axes'"); - goto fail; - } - Py_INCREF(value); - *out_axis = value; - bad_arg = 0; - } - break; - case 'c': - /* Provides a policy for allowed casting */ - if (strcmp(str, "casting") == 0) { - if (!PyArray_CastingConverter(value, out_casting)) { - goto fail; - } - bad_arg = 0; - } - break; - case 'd': - /* Another way to specify 'sig' */ - if (strcmp(str, "dtype") == 0) { - /* Allow this parameter to be None */ - PyArray_Descr *dtype; - if (!PyArray_DescrConverter2(value, &dtype)) { - goto fail; - } - if (dtype != NULL) { - if (*out_typetup != NULL) { - PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'signature' and 'dtype'"); - goto fail; - } - *out_typetup = Py_BuildValue("(N)", dtype); - } - bad_arg = 0; - } - break; - case 'e': - /* - * Overrides the global parameters buffer size, - * error mask, and error object - */ - if (strcmp(str, "extobj") == 0) { - Py_INCREF(value); - *out_extobj = value; - bad_arg = 0; - } - break; - case 'k': - if (out_keepdims != NULL && strcmp(str, "keepdims") == 0) { - if (!PyBool_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "'keepdims' must be a boolean"); - goto fail; - } - *out_keepdims = (value == Py_True); - bad_arg = 0; + if (PyTuple_CheckExact(out_kwd)) { + if (PyTuple_GET_SIZE(out_kwd) != nout) { + PyErr_SetString(PyExc_ValueError, + "The 'out' tuple must have exactly " + "one entry per ufunc output"); + goto fail; + } + /* 'out' must be a tuple of arrays and Nones */ + for(i = 0; i < nout; ++i) { + PyObject *val = PyTuple_GET_ITEM(out_kwd, i); + if (_set_out_array(val, out_op+nin+i) < 0) { + goto fail; } - break; - case 'o': - /* - * Output arrays may be specified as a keyword argument, - * either as a single array or None for single output - * ufuncs, or as a tuple of arrays and Nones. 
- */ - if (strcmp(str, "out") == 0) { - if (nargs > nin) { - PyErr_SetString(PyExc_ValueError, - "cannot specify 'out' as both a " - "positional and keyword argument"); - goto fail; - } - if (PyTuple_CheckExact(value)) { - if (PyTuple_GET_SIZE(value) != nout) { - PyErr_SetString(PyExc_ValueError, - "The 'out' tuple must have exactly " - "one entry per ufunc output"); - goto fail; - } - /* 'out' must be a tuple of arrays and Nones */ - for(i = 0; i < nout; ++i) { - PyObject *val = PyTuple_GET_ITEM(value, i); - if (_set_out_array(val, out_op+nin+i) < 0) { - goto fail; - } - } - } - else if (nout == 1) { - /* Can be an array if it only has one output */ - if (_set_out_array(value, out_op + nin) < 0) { - goto fail; - } - } - else { - /* - * If the deprecated behavior is ever removed, - * keep only the else branch of this if-else - */ - if (PyArray_Check(value) || value == Py_None) { - if (DEPRECATE("passing a single array to the " - "'out' keyword argument of a " - "ufunc with\n" - "more than one output will " - "result in an error in the " - "future") < 0) { - /* The future error message */ - PyErr_SetString(PyExc_TypeError, + } + } + else if (nout == 1) { + /* Can be an array if it only has one output */ + if (_set_out_array(out_kwd, out_op + nin) < 0) { + goto fail; + } + } + else { + /* + * If the deprecated behavior is ever removed, + * keep only the else branch of this if-else + */ + if (PyArray_Check(out_kwd) || out_kwd == Py_None) { + if (DEPRECATE("passing a single array to the " + "'out' keyword argument of a " + "ufunc with\n" + "more than one output will " + "result in an error in the " + "future") < 0) { + /* The future error message */ + PyErr_SetString(PyExc_TypeError, "'out' must be a tuple of arrays"); - goto fail; - } - if (_set_out_array(value, out_op+nin) < 0) { - goto fail; - } - } - else { - PyErr_SetString(PyExc_TypeError, - nout > 1 ? "'out' must be a tuple " - "of arrays" : - "'out' must be an array or a " - "tuple of a single array"); - goto fail; - } - } - bad_arg = 0; + goto fail; } - /* Allows the default output layout to be overridden */ - else if (strcmp(str, "order") == 0) { - if (!PyArray_OrderConverter(value, out_order)) { - goto fail; - } - bad_arg = 0; + if (_set_out_array(out_kwd, out_op+nin) < 0) { + goto fail; } - break; - case 's': - /* Allows a specific function inner loop to be selected */ - if (strcmp(str, "sig") == 0 || - strcmp(str, "signature") == 0) { - if (has_sig == 1) { - PyErr_SetString(PyExc_ValueError, + } + else { + PyErr_SetString(PyExc_TypeError, + nout > 1 ? "'out' must be a tuple " + "of arrays" : + "'out' must be an array or a " + "tuple of a single array"); + goto fail; + } + } + } + /* + * Check we did not get both axis and axes, or multiple ways + * to define a signature. 
+ */ + if (out_axes != NULL && out_axis != NULL && + *out_axes != NULL && *out_axis != NULL) { + PyErr_SetString(PyExc_TypeError, + "cannot specify both 'axis' and 'axes'"); + goto fail; + } + if (sig) { /* borrowed reference */ + if (*out_typetup != NULL) { + PyErr_SetString(PyExc_ValueError, "cannot specify both 'sig' and 'signature'"); - goto fail; - } - if (*out_typetup != NULL) { - PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'signature' and 'dtype'"); - goto fail; - } - Py_INCREF(value); - *out_typetup = value; - bad_arg = 0; - has_sig = 1; - } - else if (strcmp(str, "subok") == 0) { - if (!PyBool_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "'subok' must be a boolean"); - goto fail; - } - *out_subok = (value == Py_True); - bad_arg = 0; - } - break; - case 'w': - /* - * Provides a boolean array 'where=' mask if - * out_wheremask is supplied. - */ - if (out_wheremask != NULL && strcmp(str, "where") == 0) { - PyArray_Descr *dtype; - dtype = PyArray_DescrFromType(NPY_BOOL); - if (dtype == NULL) { - goto fail; - } - if (value == Py_True) { - /* - * Optimization: where=True is the same as no - * where argument. This lets us document it as a - * default argument - */ - bad_arg = 0; - break; - } - *out_wheremask = (PyArrayObject *)PyArray_FromAny( - value, dtype, - 0, 0, 0, NULL); - if (*out_wheremask == NULL) { - goto fail; - } - bad_arg = 0; - } - break; + goto fail; } - - if (bad_arg) { - char *format = "'%s' is an invalid keyword to ufunc '%s'"; - PyErr_Format(PyExc_TypeError, format, str, ufunc_name); + Py_INCREF(sig); + *out_typetup = sig; + } + if (dtype) { /* new reference */ + if (*out_typetup != NULL) { + PyErr_SetString(PyExc_RuntimeError, + "cannot specify both 'signature' and 'dtype'"); goto fail; } + /* Note: "N" uses the reference */ + *out_typetup = Py_BuildValue("(N)", dtype); } } - Py_XDECREF(str_key_obj); - return 0; fail: - Py_XDECREF(str_key_obj); + Py_XDECREF(dtype); Py_XDECREF(*out_typetup); Py_XDECREF(*out_extobj); if (out_wheremask != NULL) { diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h index d6fd3837a..5438270f1 100644 --- a/numpy/core/src/umath/ufunc_object.h +++ b/numpy/core/src/umath/ufunc_object.h @@ -10,13 +10,23 @@ ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args); NPY_NO_EXPORT const char* ufunc_get_name_cstr(PyUFuncObject *ufunc); -/* interned strings (on umath import) */ -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_out; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_subok; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_prepare; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_wrap; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_array_finalize; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_ufunc; -NPY_VISIBILITY_HIDDEN extern PyObject * npy_um_str_pyvals_name; +/* strings from umathmodule.c that are interned on umath import */ +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_out; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_where; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axes; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_axis; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_keepdims; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_casting; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_order; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_dtype; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_subok; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_signature; +NPY_VISIBILITY_HIDDEN extern PyObject 
*npy_um_str_sig; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_extobj; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_wrap; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_finalize; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_ufunc; +NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_pyvals_name; #endif diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 5567b9bbf..9291a5138 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -226,20 +226,40 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args) ***************************************************************************** */ -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_out = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_subok = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_prepare = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_wrap = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_array_finalize = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_ufunc = NULL; -NPY_VISIBILITY_HIDDEN PyObject * npy_um_str_pyvals_name = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_out = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_where = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axes = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_axis = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_keepdims = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_casting = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_order = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_dtype = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_subok = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_signature = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_sig = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_extobj = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_prepare = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_wrap = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_finalize = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_ufunc = NULL; +NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL; /* intern some strings used in ufuncs */ static int intern_strings(void) { npy_um_str_out = PyUString_InternFromString("out"); + npy_um_str_where = PyUString_InternFromString("where"); + npy_um_str_axes = PyUString_InternFromString("axes"); + npy_um_str_axis = PyUString_InternFromString("axis"); + npy_um_str_keepdims = PyUString_InternFromString("keepdims"); + npy_um_str_casting = PyUString_InternFromString("casting"); + npy_um_str_order = PyUString_InternFromString("order"); + npy_um_str_dtype = PyUString_InternFromString("dtype"); npy_um_str_subok = PyUString_InternFromString("subok"); + npy_um_str_signature = PyUString_InternFromString("signature"); + npy_um_str_sig = PyUString_InternFromString("sig"); + npy_um_str_extobj = PyUString_InternFromString("extobj"); npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__"); npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__"); npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__"); diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py index a72079218..8ce374a75 100644 --- a/numpy/core/tests/test_einsum.py +++ b/numpy/core/tests/test_einsum.py @@ -16,7 +16,7 @@ for size, char in zip(sizes, chars): global_size_dict[char] = size -class TestEinSum(object): +class TestEinsum(object): def 
test_einsum_errors(self): for do_opt in [True, False]: # Need enough arguments @@ -614,7 +614,7 @@ class TestEinSum(object): np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False) assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False)) assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False)) - + def test_einsum_broadcast(self): # Issue #2455 change in handling ellipsis # remove the 'middle broadcast' error @@ -735,19 +735,22 @@ class TestEinSum(object): res = np.einsum('...ij,...jk->...ik', a, a, out=a) assert res is a - def optimize_compare(self, string): + def optimize_compare(self, subscripts, operands=None): # Tests all paths of the optimization function against # conventional einsum - operands = [string] - terms = string.split('->')[0].split(',') - for term in terms: - dims = [global_size_dict[x] for x in term] - operands.append(np.random.rand(*dims)) - - noopt = np.einsum(*operands, optimize=False) - opt = np.einsum(*operands, optimize='greedy') + if operands is None: + args = [subscripts] + terms = subscripts.split('->')[0].split(',') + for term in terms: + dims = [global_size_dict[x] for x in term] + args.append(np.random.rand(*dims)) + else: + args = [subscripts] + operands + + noopt = np.einsum(*args, optimize=False) + opt = np.einsum(*args, optimize='greedy') assert_almost_equal(opt, noopt) - opt = np.einsum(*operands, optimize='optimal') + opt = np.einsum(*args, optimize='optimal') assert_almost_equal(opt, noopt) def test_hadamard_like_products(self): @@ -833,8 +836,28 @@ class TestEinSum(object): b = np.einsum('bbcdc->d', a) assert_equal(b, [12]) + def test_broadcasting_dot_cases(self): + # Ensures broadcasting cases are not mistaken for GEMM -class TestEinSumPath(object): + a = np.random.rand(1, 5, 4) + b = np.random.rand(4, 6) + c = np.random.rand(5, 6) + d = np.random.rand(10) + + self.optimize_compare('ijk,kl,jl', operands=[a, b, c]) + self.optimize_compare('ijk,kl,jl,i->i', operands=[a, b, c, d]) + + e = np.random.rand(1, 1, 5, 4) + f = np.random.rand(7, 7) + self.optimize_compare('abjk,kl,jl', operands=[e, b, c]) + self.optimize_compare('abjk,kl,jl,ab->ab', operands=[e, b, c, f]) + + # Edge case found in gh-11308 + g = np.arange(64).reshape(2, 4, 8) + self.optimize_compare('obk,ijk->ioj', operands=[g, g]) + + +class TestEinsumPath(object): def build_operands(self, string, size_dict=global_size_dict): # Builds views based off initial operands @@ -880,7 +903,7 @@ class TestEinSumPath(object): long_test1 = self.build_operands('acdf,jbje,gihb,hfac,gfac,gifabc,hfac') path, path_str = np.einsum_path(*long_test1, optimize='greedy') self.assert_path_equal(path, ['einsum_path', - (1, 4), (2, 4), (1, 4), (1, 3), (1, 2), (0, 1)]) + (3, 6), (3, 4), (2, 4), (2, 3), (0, 2), (0, 1)]) path, path_str = np.einsum_path(*long_test1, optimize='optimal') self.assert_path_equal(path, ['einsum_path', @@ -889,10 +912,10 @@ class TestEinSumPath(object): # Long test 2 long_test2 = self.build_operands('chd,bde,agbc,hiad,bdi,cgh,agdb') path, path_str = np.einsum_path(*long_test2, optimize='greedy') self.assert_path_equal(path, ['einsum_path', (3, 4), (0, 3), (3, 4), (1, 3), (1, 2), (0, 1)]) path, path_str = np.einsum_path(*long_test2, optimize='optimal') self.assert_path_equal(path, ['einsum_path', (0, 5), (1, 4), (3, 4), (1, 3), (1, 2), (0, 1)]) @@ -926,7 +949,7 @@ class TestEinSumPath(object): # Edge test4 edge_test4 = self.build_operands('dcc,fce,ea,dbf->ab') path, path_str =
np.einsum_path(*edge_test4, optimize='greedy') - self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)]) + self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)]) path, path_str = np.einsum_path(*edge_test4, optimize='optimal') self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 2), (0, 1)]) @@ -949,7 +972,7 @@ class TestEinSumPath(object): self.assert_path_equal(path, ['einsum_path', (0, 1, 2, 3)]) path, path_str = np.einsum_path(*path_test, optimize=True) - self.assert_path_equal(path, ['einsum_path', (0, 3), (0, 2), (0, 1)]) + self.assert_path_equal(path, ['einsum_path', (1, 2), (0, 1), (0, 1)]) exp_path = ['einsum_path', (0, 2), (0, 2), (0, 1)] path, path_str = np.einsum_path(*path_test, optimize=exp_path) diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py index cbcd3e994..276cd9f93 100644 --- a/numpy/core/tests/test_indexing.py +++ b/numpy/core/tests/test_indexing.py @@ -329,6 +329,21 @@ class TestIndexing(object): assert_raises(IndexError, a.__getitem__, ind) assert_raises(IndexError, a.__setitem__, ind, 0) + def test_trivial_fancy_not_possible(self): + # Test that the fast path for trivial assignment is not incorrectly + # used when the index is not contiguous or 1D, see also gh-11467. + a = np.arange(6) + idx = np.arange(6, dtype=np.intp).reshape(2, 1, 3)[:, :, 0] + assert_array_equal(a[idx], idx) + + # This case must not go into the fast path; note that idx is + # a non-contiguous, non-1D array here. + a[idx] = -1 + res = np.arange(6) + res[0] = -1 + res[3] = -1 + assert_array_equal(a, res) + def test_nonbaseclass_values(self): class SubClass(np.ndarray): def __array_finalize__(self, old): diff --git a/numpy/distutils/__init__.py b/numpy/distutils/__init__.py index b794bebd7..8dd326920 100644 --- a/numpy/distutils/__init__.py +++ b/numpy/distutils/__init__.py @@ -17,7 +17,7 @@ try: # Normally numpy is installed if the above import works, but an interrupted # in-place build could also have left a __config__.py. In that case the # next import may still fail, so keep it inside the try block.
- from numpy.testing._private.pytesttester import PytestTester + from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester except ImportError: diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py index 5075c682d..fbb64f762 100644 --- a/numpy/f2py/__init__.py +++ b/numpy/f2py/__init__.py @@ -69,6 +69,6 @@ def compile(source, f.close() return status -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py index bbb6ec8c7..44243b483 100644 --- a/numpy/fft/__init__.py +++ b/numpy/fft/__init__.py @@ -6,6 +6,6 @@ from .info import __doc__ from .fftpack import * from .helper import * -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py index d764cdc7e..dc40ac67b 100644 --- a/numpy/lib/__init__.py +++ b/numpy/lib/__init__.py @@ -46,6 +46,6 @@ __all__ += financial.__all__ __all__ += nanfunctions.__all__ __all__ += histograms.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 26ef3e235..9a680dd55 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -27,6 +27,7 @@ from numpy.core.fromnumeric import ( ravel, nonzero, sort, partition, mean, any, sum ) from numpy.core.numerictypes import typecodes, number +from numpy.core.function_base import add_newdoc from numpy.lib.twodim_base import diag from .utils import deprecate from numpy.core.multiarray import ( @@ -3892,41 +3893,6 @@ def trapz(y, x=None, dx=1.0, axis=-1): return ret -#always succeed -def add_newdoc(place, obj, doc): - """ - Adds documentation to obj which is in module place. - - If doc is a string add it to obj as a docstring - - If doc is a tuple, then the first element is interpreted as - an attribute of obj and the second as the docstring - (method, docstring) - - If doc is a list, then each element of the list should be a - sequence of length two --> [(method1, docstring1), - (method2, docstring2), ...] - - This routine never raises an error. - - This routine cannot modify read-only docstrings, as appear - in new-style classes or built-in functions. Because this - routine never raises an error the caller must check manually - that the docstrings were changed. 
- """ - try: - new = getattr(__import__(place, globals(), {}, [obj]), obj) - if isinstance(doc, str): - add_docstring(new, doc.strip()) - elif isinstance(doc, tuple): - add_docstring(getattr(new, doc[0]), doc[1].strip()) - elif isinstance(doc, list): - for val in doc: - add_docstring(getattr(new, val[0]), val[1].strip()) - except Exception: - pass - - # Based on scitools meshgrid def meshgrid(*xi, **kwargs): """ diff --git a/numpy/linalg/__init__.py b/numpy/linalg/__init__.py index 37bd27574..4b696c883 100644 --- a/numpy/linalg/__init__.py +++ b/numpy/linalg/__init__.py @@ -50,6 +50,6 @@ from .info import __doc__ from .linalg import * -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/ma/__init__.py b/numpy/ma/__init__.py index 34f21b8b1..36ceb1f6e 100644 --- a/numpy/ma/__init__.py +++ b/numpy/ma/__init__.py @@ -51,6 +51,6 @@ __all__ = ['core', 'extras'] __all__ += core.__all__ __all__ += extras.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/matrixlib/__init__.py b/numpy/matrixlib/__init__.py index 3ad3a9549..777e0cd33 100644 --- a/numpy/matrixlib/__init__.py +++ b/numpy/matrixlib/__init__.py @@ -7,6 +7,6 @@ from .defmatrix import * __all__ = defmatrix.__all__ -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py index c18bebedb..85cee9ce6 100644 --- a/numpy/polynomial/__init__.py +++ b/numpy/polynomial/__init__.py @@ -22,6 +22,6 @@ from .hermite import Hermite from .hermite_e import HermiteE from .laguerre import Laguerre -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/random/__init__.py b/numpy/random/__init__.py index 81cb94cc1..82aefce5f 100644 --- a/numpy/random/__init__.py +++ b/numpy/random/__init__.py @@ -117,6 +117,6 @@ def __RandomState_ctor(): """ return RandomState(seed=0) -from numpy.testing._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester diff --git a/numpy/testing/__init__.py b/numpy/testing/__init__.py index a7c85931c..a8bd4fc15 100644 --- a/numpy/testing/__init__.py +++ b/numpy/testing/__init__.py @@ -17,6 +17,6 @@ from ._private.nosetester import ( __all__ = _private.utils.__all__ + ['TestCase', 'run_module_suite'] -from ._private.pytesttester import PytestTester +from numpy._pytesttester import PytestTester test = PytestTester(__name__) del PytestTester |