diff options
162 files changed, 7102 insertions, 3245 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml index 230871ce1..e2eb01b04 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,7 +22,7 @@ jobs: . venv/bin/activate pip install cython sphinx matplotlib sudo apt-get update - sudo apt-get install -y graphviz + sudo apt-get install -y graphviz texlive-fonts-recommended texlive-latex-recommended texlive-latex-extra texlive-generic-extra latexmk texlive-xetex - run: name: build numpy diff --git a/.gitignore b/.gitignore index 6e3f8e041..fbdd4f784 100644 --- a/.gitignore +++ b/.gitignore @@ -100,6 +100,10 @@ Icon? ehthumbs.db Thumbs.db +# pytest generated files # +########################## +/.pytest_cache + # Things specific to this project # ################################### numpy/core/__svn_version__.py diff --git a/.travis.yml b/.travis.yml index 168a7a385..6b010e58f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,11 +39,13 @@ python: matrix: include: - python: 3.6 - env: USE_CHROOT=1 ARCH=i386 DIST=artful PYTHON=3.6 + env: USE_CHROOT=1 ARCH=i386 DIST=bionic PYTHON=3.6 sudo: true addons: apt: + update: true packages: + - dpkg - debootstrap - python: 3.4 env: USE_DEBUG=1 @@ -54,7 +56,6 @@ matrix: - cython3-dbg - python3-dbg - python3-dev - - python3-nose - python3-setuptools - python: 3.6 env: USE_WHEEL=1 RUN_FULL_TESTS=1 @@ -1,4 +1,4 @@ -# <img alt="NumPy" src="branding/icons/numpylogo.svg" height="60"> +# <img alt="NumPy" src="https://cdn.rawgit.com/numpy/numpy/master/branding/icons/numpylogo.svg" height="60"> [](https://travis-ci.org/numpy/numpy) [](https://ci.appveyor.com/project/charris/numpy) diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py new file mode 100644 index 000000000..83f26c9d1 --- /dev/null +++ b/benchmarks/benchmarks/bench_lib.py @@ -0,0 +1,25 @@ +"""Benchmarks for `numpy.lib`.""" + + +from __future__ import absolute_import, division, print_function + +from .common import Benchmark + +import numpy as np + + +class Pad(Benchmark): + 
"""Benchmarks for `numpy.pad`.""" + + param_names = ["shape", "pad_width", "mode"] + params = [ + [(1000,), (10, 100), (10, 10, 10)], + [1, 3, (0, 5)], + ["constant", "edge", "linear_ramp", "mean", "reflect", "wrap"], + ] + + def setup(self, shape, pad_width, mode): + self.array = np.empty(shape) + + def time_pad(self, shape, pad_width, mode): + np.pad(self.array, pad_width, mode) diff --git a/benchmarks/benchmarks/bench_random.py b/benchmarks/benchmarks/bench_random.py index 7ed3e2fa1..9d84d83d3 100644 --- a/benchmarks/benchmarks/bench_random.py +++ b/benchmarks/benchmarks/bench_random.py @@ -65,3 +65,18 @@ class Randint_dtype(Benchmark): high = self.high[name] np.random.randint(0, high + 1, size=10**5, dtype=name) + +class Permutation(Benchmark): + def setup(self): + self.n = 10000 + self.a_1d = np.random.random_sample(self.n) + self.a_2d = np.random.random_sample((self.n, 2)) + + def time_permutation_1d(self): + np.random.permutation(self.a_1d) + + def time_permutation_2d(self): + np.random.permutation(self.a_2d) + + def time_permutation_int(self): + np.random.permutation(self.n) diff --git a/doc/RELEASE_WALKTHROUGH.rst.txt b/doc/RELEASE_WALKTHROUGH.rst.txt index 81e15f91f..ad14c16c1 100644 --- a/doc/RELEASE_WALKTHROUGH.rst.txt +++ b/doc/RELEASE_WALKTHROUGH.rst.txt @@ -1,12 +1,26 @@ -This file contains a walkthrough of the NumPy 1.12.0 release on Fedora Linux. +This file contains a walkthrough of the NumPy 1.14.4 release on Linux. The commands can be copied into the command line, but be sure to -replace 1.12.0 by the correct version. +replace 1.14.4 by the correct version. Release Walkthrough ==================== -Building the release --------------------- +Update Release documentation +---------------------------- + +The file ``doc/changelog/1.14.4-changelog.rst`` should be updated to reflect +the final list of changes and contributors. 
This text can be generated by:: + + $ python tools/changelog.py $GITHUB v1.14.3..maintenance/1.14.x > doc/changelog/1.14.4-changelog.rst + +where ``GITHUB`` contains your github access token. This text may also be +appended to ``doc/release/1.14.4-notes.rst`` for release updates, though not +for new releases like ``1.14.0``, as the changelogs for latter tend to be +excessively long. The ``doc/source/release.rst`` file should also be +updated with a link to the new release notes. + +Prepare the release commit +-------------------------- Checkout the branch for the release, make sure it is up to date, and clean the repository:: @@ -16,44 +30,48 @@ repository:: $ git submodule update $ git clean -xdf -Look at the git log to get the hash of the last commit in the release, then -check it out:: - - $ git log - $ git checkout 7849751173fb47a5f17761b3515b42b4d8ce1197 - Edit pavement.py and setup.py as detailed in HOWTO_RELEASE:: $ gvim pavement.py setup.py - $ git commit -a -m"REL: NumPy 1.14.1 release." + $ git commit -a -m"REL: NumPy 1.14.4 release." Sanity check:: $ python runtests.py -m "full" $ python3 runtests.py -m "full" -Tag it,and build the source distribution archives:: +Push this release directly onto the end of the maintenance branch. This +requires write permission to the numpy repository:: - $ git tag -s v1.14.1 - $ paver sdist # sdist will do a git clean -xdf, so we omit that + $ git push upstream maintenance/1.14.x -Check that the files in ``release/installers`` have the correct versions, then -push the tag upstream; generation of the wheels for PyPI needs it:: +As an example, see the 1.14.3 REL commit: `<https://github.com/numpy/numpy/commit/73299826729be58cec179b52c656adfcaefada93>`_. - $ git push upstream v1.14.1 +Build source releases +--------------------- -Trigger the wheels build. This can take a while. The numpy-wheels repository is -cloned from `<https://github.com/MacPython/numpy-wheels>`_. 
Start with a pull -as the repo may have been accessed and changed by someone else and a push will -fail. +Paver is used to build the source releases. It will create the ``release`` and +``release/installers`` directories and put the ``*.zip`` and ``*.tar.gz`` +source releases in the latter. + + $ paver sdist # sdist will do a git clean -xdf, so we omit that + +Build wheels +------------ + +Trigger the wheels build by pointing the numpy-wheels repository at this +commit. This can take a while. The numpy-wheels repository is cloned from +`<https://github.com/MacPython/numpy-wheels>`_. Start with a pull as the repo +may have been accessed and changed by someone else and a push will fail:: $ cd ../numpy-wheels $ git pull origin master $ git branch <new version> # only when starting new numpy version - $ git checkout v1.14.x # v1.14.x already existed for the 1.14.1 release + $ git checkout v1.14.x # v1.14.x already existed for the 1.14.4 release -The ``.travis.yml`` and ``appveyor.yml`` files need to be edited to make -sure they have the correct version, search for ``BUILD_COMMIT``. +Edit the ``.travis.yml`` and ``.appveyor.yml`` files to make sure they have the +correct version, and put in the commit hash for the ``REL`` commit created +above for ``BUILD_COMMIT``, see the _example from `v1.14.3`:: $ gvim .travis.yml appveyor.yml $ git commit -a @@ -65,41 +83,76 @@ provided at `<https://github.com/MacPython/numpy-wheels>`_ to check the travis and appveyor build status. Check if all the needed wheels have been built and uploaded before proceeding. There should currently be 22 of them at `<https://wheels.scipy.org>`_, 4 for Mac, 8 for Windows, and 10 for Linux. +Note that sometimes builds, like tests, fail for unrelated reasons and you will +need to restart them. +.. 
example_: https://github.com/MacPython/numpy-wheels/commit/fed9c04629c155e7804282eb803d81097244598d Download wheels --------------- -When the wheels have all been built, download them using the ``wheel-uploader`` +When the wheels have all been successfully built, download them using the ``wheel-uploader`` in the ``terryfy`` repository. The terryfy repository may be cloned from `<https://github.com/MacPython/terryfy>`_ if you don't already have it. The wheels can also be uploaded using the ``wheel-uploader``, but we prefer to download all the wheels to the ``../numpy/release/installers`` directory and -upload later using ``twine``. +upload later using ``twine``:: $ cd ../terryfy $ git pull origin master $ CDN_URL=https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com $ NPY_WHLS=../numpy/release/installers - $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t win numpy 1.14.1 - $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t manylinux1 numpy 1.14.1 - $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t macosx numpy 1.14.1 + $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t win numpy 1.14.4 + $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t manylinux1 numpy 1.14.4 + $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t macosx numpy 1.14.4 If you do this often, consider making CDN_URL and NPY_WHLS part of your default -environment. +environment. Note that we need local copies of the files in order to generate +hashes to include in the README files generated later. + +Tag the release +--------------- + +Once the wheels have been built and downloaded without errors, go back to your +numpy repository in the maintenance branch and tag the ``REL`` commit, signing +it with your gpg key, and build the source distribution archives:: + $ git tag -s v1.14.4 + +You should upload your public gpg key to github, so that the tag will appear +"verified" there. 
+ +Check that the files in ``release/installers`` have the correct versions, then +push the tag upstream:: + + $ git push upstream v1.14.4 + +We wait until this point to push the tag because it is very difficult to change +the tag after it has been pushed. + +Reset the maintenance branch into a development state +----------------------------------------------------- + +Add another ``REL`` commit to the numpy maintenance branch, which resets the +``ISRELEASED`` flag to ``False`` and increments the version counter:: + + $ gvim pavement.py setup.py + $ git commit -a -m"REL: prepare 1.14.x for further development" + $ git push upstream maintenance/1.14.x + +This strategy is copied from the scipy release procedure and was used in numpy +for the first time in 1.14.3. It needed to be modified a little since numpy +has more strict requirements for the version number. Upload to PyPI -------------- -Upload to PyPI using ``twine``. The choice here is to sign the files, so will -need to sign every file separately when they are uploaded, keeping the gpg pass -phrase in the clipboard and pasting it in will make that easier. We may chose -to forgo the signing in the future:: +Upload to PyPI using ``twine``. A recent version of ``twine`` is needed +after recent PyPI changes, version ``1.11.0`` was used here. :: + + $ cd ../numpy - $ twine upload -s release/installers/*.whl - $ twine upload -s release/installers/numpy-1.14.1.zip # Upload last. + $ twine upload release/installers/*.whl + $ twine upload release/installers/numpy-1.14.4.zip # Upload last. If one of the commands breaks in the middle, which is not uncommon, you may need to selectively upload the remaining files because PyPI does not allow the @@ -108,27 +161,22 @@ avoid synchronization problems if pip users access the files while this is in process. Note that PyPI only allows a single source distribution, here we have chosen the zip archive. 
-If this is not a final release, log into PyPI and hide the new directory while -making sure the last stable release is visible. - - Upload files to github ---------------------- -Generate the ``release/README`` files:: +Generate the ``release/README.*`` files:: - $ rm release/installers/*.asc $ paver write_release_and_log -Go to `<https://github.com/numpy/numpy/releases>`_, there should be a ``v1.14.1 +Go to `<https://github.com/numpy/numpy/releases>`_, there should be a ``v1.14.4 tag``, click on it and hit the edit button for that tag. There are two ways to add files, using an editable text window and as binary uploads. - Cut and paste the ``release/README.md`` file contents into the text window. -- Upload ``release/installers/numpy-1.12.0.tar.gz`` as a binary file. -- Upload ``release/installers/numpy-1.12.0.zip`` as a binary file. -- Upload ``release/README`` as a binary file. -- Upload ``doc/changelog/1.14.1-changelog.rst`` as a binary file. +- Upload ``release/installers/numpy-1.14.4.tar.gz`` as a binary file. +- Upload ``release/installers/numpy-1.14.4.zip`` as a binary file. +- Upload ``release/README.rst`` as a binary file. +- Upload ``doc/changelog/1.14.4-changelog.rst`` as a binary file. - Check the pre-release button if this is a pre-releases. - Hit the ``{Publish,Update} release`` button at the bottom. @@ -143,7 +191,7 @@ upload the documentation. 
Otherwise:: $ pushd doc $ make dist - $ make upload USERNAME=<yourname> RELEASE=v1.14.1 + $ make upload USERNAME=<yourname> RELEASE=v1.14.4 $ popd If the release series is a new one, you will need to rebuild and upload the @@ -164,7 +212,7 @@ This assumes that you have forked `<https://github.com/scipy/scipy.org>`_:: $ cd ../scipy.org $ git checkout master $ git pull upstream master - $ git checkout -b numpy-1.14.1 + $ git checkout -b numpy-1.14.4 $ gvim www/index.rst # edit the News section $ git commit -a $ git push origin HEAD @@ -176,11 +224,5 @@ Announce to mailing lists The release should be announced on the numpy-discussion, scipy-devel, scipy-user, and python-announce-list mailing lists. Look at previous -announcements for the basic template. The contributor list can be generated as -follows:: - - $ cd ../numpy - $ ./tools/changelog.py $GITHUB v1.14.0..v1.14.1 > tmp.rst - -The contents of ``tmp.rst`` can then be cut and pasted into the announcement -email. +announcements for the basic template. The contributor and PR lists +are the same as generated for the release notes above. diff --git a/doc/changelog/1.14.3-changelog.rst b/doc/changelog/1.14.3-changelog.rst new file mode 100644 index 000000000..784a9177f --- /dev/null +++ b/doc/changelog/1.14.3-changelog.rst @@ -0,0 +1,27 @@ + +Contributors +============ + +A total of 6 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Allan Haldane +* Charles Harris +* Jonathan March + +* Malcolm Smith + +* Matti Picus +* Pauli Virtanen + +Pull requests merged +==================== + +A total of 8 pull requests were merged for this release. 
+ +* `#10862 <https://github.com/numpy/numpy/pull/10862>`__: BUG: floating types should override tp_print (1.14 backport) +* `#10905 <https://github.com/numpy/numpy/pull/10905>`__: BUG: for 1.14 back-compat, accept list-of-lists in fromrecords +* `#10947 <https://github.com/numpy/numpy/pull/10947>`__: BUG: 'style' arg to array2string broken in legacy mode (1.14... +* `#10959 <https://github.com/numpy/numpy/pull/10959>`__: BUG: test, fix for missing flags['WRITEBACKIFCOPY'] key +* `#10960 <https://github.com/numpy/numpy/pull/10960>`__: BUG: Add missing underscore to prototype in check_embedded_lapack +* `#10961 <https://github.com/numpy/numpy/pull/10961>`__: BUG: Fix encoding regression in ma/bench.py (Issue #10868) +* `#10962 <https://github.com/numpy/numpy/pull/10962>`__: BUG: core: fix NPY_TITLE_KEY macro on pypy +* `#10974 <https://github.com/numpy/numpy/pull/10974>`__: BUG: test, fix PyArray_DiscardWritebackIfCopy... diff --git a/doc/changelog/1.14.4-changelog.rst b/doc/changelog/1.14.4-changelog.rst new file mode 100644 index 000000000..0bda55cf1 --- /dev/null +++ b/doc/changelog/1.14.4-changelog.rst @@ -0,0 +1,31 @@ + +Contributors +============ + +A total of 7 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Allan Haldane +* Charles Harris +* Marten van Kerkwijk +* Matti Picus +* Pauli Virtanen +* Ryan Soklaski + +* Sebastian Berg + +Pull requests merged +==================== + +A total of 11 pull requests were merged for this release. 
+ +* `#11104 <https://github.com/numpy/numpy/pull/11104>`__: BUG: str of DOUBLE_DOUBLE format wrong on ppc64 +* `#11170 <https://github.com/numpy/numpy/pull/11170>`__: TST: linalg: add regression test for gh-8577 +* `#11174 <https://github.com/numpy/numpy/pull/11174>`__: MAINT: add sanity-checks to be run at import time +* `#11181 <https://github.com/numpy/numpy/pull/11181>`__: BUG: void dtype setup checked offset not actual pointer for alignment +* `#11194 <https://github.com/numpy/numpy/pull/11194>`__: BUG: Python2 doubles don't print correctly in interactive shell. +* `#11198 <https://github.com/numpy/numpy/pull/11198>`__: BUG: optimizing compilers can reorder call to npy_get_floatstatus +* `#11199 <https://github.com/numpy/numpy/pull/11199>`__: BUG: reduce using SSE only warns if inside SSE loop +* `#11203 <https://github.com/numpy/numpy/pull/11203>`__: BUG: Bytes delimiter/comments in genfromtxt should be decoded +* `#11211 <https://github.com/numpy/numpy/pull/11211>`__: BUG: Fix reference count/memory leak exposed by better testing +* `#11219 <https://github.com/numpy/numpy/pull/11219>`__: BUG: Fixes einsum broadcasting bug when optimize=True +* `#11251 <https://github.com/numpy/numpy/pull/11251>`__: DOC: Document 1.14.4 release. diff --git a/doc/neps/index.rst.tmpl b/doc/neps/index.rst.tmpl index 8d0c5da77..6c988014f 100644 --- a/doc/neps/index.rst.tmpl +++ b/doc/neps/index.rst.tmpl @@ -30,6 +30,19 @@ Accepted NEPs, implementation in progress NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} + +Open NEPs (under consideration) +------------------------------- + +.. 
toctree:: + :maxdepth: 1 + +{% for nep, tags in neps.items() if tags['Status'] == 'Draft' %} + NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> +{% endfor %} + + + Implemented NEPs ---------------- @@ -40,8 +53,8 @@ Implemented NEPs NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} -Defunct NEPs ------------- +Deferred NEPs +------------- .. toctree:: :maxdepth: 1 @@ -49,3 +62,13 @@ Defunct NEPs {% for nep, tags in neps.items() if tags['Status'] == 'Deferred' %} NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> {% endfor %} + +Rejected NEPs +------------- + +.. toctree:: + :maxdepth: 1 + +{% for nep, tags in neps.items() if tags['Status'] == 'Rejected' %} + NEP {{ nep }} — {{ tags['Title'] }} <{{ tags['Filename'] }}> +{% endfor %} diff --git a/doc/neps/nep-0000.rst b/doc/neps/nep-0000.rst index 2eed19161..9c6646db2 100644 --- a/doc/neps/nep-0000.rst +++ b/doc/neps/nep-0000.rst @@ -3,7 +3,7 @@ Purpose and Process =================== :Author: Jarrod Millman <millman@berkeley.edu> -:Status: Draft +:Status: Active :Type: Process :Created: 2017-12-11 @@ -64,12 +64,19 @@ champion (a.k.a. Author) should first attempt to ascertain whether the idea is suitable for a NEP. Posting to the numpy-discussion `mailing list`_ is the best way to go about doing this. -Following a discussion on the mailing list, the proposal should be submitted as -a draft NEP via a `GitHub pull request`_ to the ``doc/neps`` directory with the -name ``nep-<n>.rst`` where ``<n>`` is an appropriately assigned four-digit -number (e.g., ``nep-0000.rst``). The draft must use the :doc:`nep-template` -file. Once a formal proposal has been submitted as a PR, it should be announced -on the mailing list. +The proposal should be submitted as a draft NEP via a `GitHub pull +request`_ to the ``doc/neps`` directory with the name ``nep-<n>.rst`` +where ``<n>`` is an appropriately assigned four-digit number (e.g., +``nep-0000.rst``). 
The draft must use the :doc:`nep-template` file. + +Once the PR is in place, the NEP should be announced on the mailing +list for discussion (comments on the PR itself should be restricted to +minor editorial and technical fixes). + +At the earliest convenience, the PR should be merged (regardless of +whether it is accepted during discussion). Additional PRs may be made +by the Author to update or expand the NEP, or by maintainers to set +its status, discussion URL, etc. Standards Track NEPs consist of two parts, a design document and a reference implementation. It is generally recommended that at least a @@ -83,9 +90,8 @@ mark the PR as a WIP). Review and Resolution ^^^^^^^^^^^^^^^^^^^^^ -NEPs are discussed on the mailing list and perhaps in other forums. -Sometimes NEPs will grow out of an existing pull request. -The possible paths of the status of NEPs are as follows: +NEPs are discussed on the mailing list. The possible paths of the +status of NEPs are as follows: .. image:: _static/nep-0000.png diff --git a/doc/neps/nep-0017-split-out-maskedarray.rst b/doc/neps/nep-0017-split-out-maskedarray.rst new file mode 100644 index 000000000..d6dcc1def --- /dev/null +++ b/doc/neps/nep-0017-split-out-maskedarray.rst @@ -0,0 +1,129 @@ +======================= +Split Out Masked Arrays +======================= + +:Author: Stéfan van der Walt <stefanv@berkeley.edu> +:Status: Rejected +:Type: Standards Track +:Created: 2018-03-22 +:Resolution: https://mail.python.org/pipermail/numpy-discussion/2018-May/078026.html + +Abstract +-------- + +This NEP proposes removing MaskedArray functionality from NumPy, and +publishing it as a stand-alone package. + +Detailed description +-------------------- + +MaskedArrays are a sub-class of the NumPy ``ndarray`` that adds +masking capabilities, i.e. the ability to ignore or hide certain array +values during computation. 
+ +While historically convenient to distribute this class inside of NumPy, +improved packaging has made it possible to distribute it separately +without difficulty. + +Motivations for this move include: + + * Focus: the NumPy package should strive to only include the + `ndarray` object, and the essential utilities needed to manipulate + such arrays. + * Complexity: the MaskedArray implementation is non-trivial, and imposes + a significant maintenance burden. + * Compatibility: MaskedArray objects, being subclasses [1]_ of `ndarrays`, + often cause complications when being used with other packages. + Fixing these issues is outside the scope of NumPy development. + +This NEP proposes a deprecation pathway through which MaskedArrays +would still be accessible to users, but no longer as part of the core +package. + +Implementation +-------------- + +Currently, a MaskedArray is created as follows:: + + from numpy import ma + ma.array([1, 2, 3], mask=[True, False, True]) + +This will return an array where the values 1 and 3 are masked (no +longer visible to operations such as `np.sum`). + +We propose refactoring the `np.ma` subpackage into a new +pip-installable library called `maskedarray` [2]_, which would be used +in a similar fashion:: + + import maskedarray as ma + ma.array([1, 2, 3], mask=[True, False, True]) + +For two releases of NumPy, `maskedarray` would become a NumPy +dependency, and would expose MaskedArrays under the existing name, +`np.ma`. If imported as `np.ma`, a `NumpyDeprecationWarning` will +be raised, describing the impending deprecation with instructions on +how to modify code to use `maskedarray`. + +After two releases, `np.ma` will be removed entirely. In order to obtain +`np.ma`, a user will install it via `pip install` or via their package +manager. Subsequently, importing `maskedarray` on a version of NumPy that +includes it integrally will raise an `ImportError`. 
+ +Documentation +````````````` + +NumPy's internal documentation refers explicitly to MaskedArrays in +certain places, e.g. `ndarray.concatenate`: + +> When one or more of the arrays to be concatenated is a MaskedArray, +> this function will return a MaskedArray object instead of an ndarray, +> but the input masks are *not* preserved. In cases where a MaskedArray +> is expected as input, use the ma.concatenate function from the masked +> array module instead. + +Such documentation will be removed, since the expectation is that +users of `maskedarray` will use methods from that package to operate +on MaskedArrays. + +Other appearances +~~~~~~~~~~~~~~~~~ + +Explicit MaskedArray support will be removed from: + +- `numpy.genfromtxt` +- `numpy.lib.merge_arrays`, `numpy.lib.stack_arrays` + +Backward compatibility +---------------------- + +For two releases of NumPy, apart from a deprecation notice, there will +be no user visible changes. Thereafter, `np.ma` will no longer be +available (instead, MaskedArrays will live in the `maskedarray` +package). + +Note also that new PEPs on array-like objects may eventually provide +better support for MaskedArrays than is currently available. + +Alternatives +------------ + +After a lively discussion on the mailing list: + +- There is support (and active interest in) making a better *new* masked array + class. +- The new class should be a consumer of the external NumPy API with no special + status (unlike today where there are hacks across the codebase to support it) +- `MaskedArray` will stay where it is, at least until the new masked array + class materializes and has been tried in the wild. + +References and Footnotes +------------------------ + +.. [1] Subclassing ndarray, + https://docs.scipy.org/doc/numpy/user/basics.subclassing.html +.. [2] PyPi: maskedarray, https://pypi.org/project/maskedarray/ + +Copyright +--------- + +This document has been placed in the public domain. 
diff --git a/doc/neps/nep-0018-array-function-protocol.rst b/doc/neps/nep-0018-array-function-protocol.rst new file mode 100644 index 000000000..943ca4cbf --- /dev/null +++ b/doc/neps/nep-0018-array-function-protocol.rst @@ -0,0 +1,543 @@ +================================================== +NEP: Dispatch Mechanism for NumPy's high level API +================================================== + +:Author: Stephan Hoyer <shoyer@google.com> +:Author: Matthew Rocklin <mrocklin@gmail.com> +:Status: Draft +:Type: Standards Track +:Created: 2018-05-29 + +Abstract +-------- + +We propose a protocol to allow arguments of numpy functions to define +how that function operates on them. This allows other libraries that +implement NumPy's high level API to reuse Numpy functions. This allows +libraries that extend NumPy's high level API to apply to more NumPy-like +libraries. + +Detailed description +-------------------- + +Numpy's high level ndarray API has been implemented several times +outside of NumPy itself for different architectures, such as for GPU +arrays (CuPy), Sparse arrays (scipy.sparse, pydata/sparse) and parallel +arrays (Dask array) as well as various Numpy-like implementations in the +deep learning frameworks, like TensorFlow and PyTorch. + +Similarly there are several projects that build on top of the Numpy API +for labeled and indexed arrays (XArray), automatic differentiation +(Autograd, Tangent), higher order array factorizations (TensorLy), etc. +that add additional functionality on top of the Numpy API. + +We would like to be able to use these libraries together, for example we +would like to be able to place a CuPy array within XArray, or perform +automatic differentiation on Dask array code. This would be easier to +accomplish if code written for NumPy ndarrays could also be used by +other NumPy-like projects. + +For example, we would like for the following code example to work +equally well with any Numpy-like array object: + +.. 
code:: python + + def f(x): + y = np.tensordot(x, x.T) + return np.mean(np.exp(y)) + +Some of this is possible today with various protocol mechanisms within +Numpy. + +- The ``np.exp`` function checks the ``__array_ufunc__`` protocol +- The ``.T`` method works using Python's method dispatch +- The ``np.mean`` function explicitly checks for a ``.mean`` method on + the argument + +However other functions, like ``np.tensordot`` do not dispatch, and +instead are likely to coerce to a Numpy array (using the ``__array__``) +protocol, or err outright. To achieve enough coverage of the NumPy API +to support downstream projects like XArray and autograd we want to +support *almost all* functions within Numpy, which calls for a more +reaching protocol than just ``__array_ufunc__``. We would like a +protocol that allows arguments of a NumPy function to take control and +divert execution to another function (for example a GPU or parallel +implementation) in a way that is safe and consistent across projects. + +Implementation +-------------- + +We propose adding support for a new protocol in NumPy, +``__array_function__``. + +This protocol is intended to be a catch-all for NumPy functionality that +is not covered by existing protocols, like reductions (like ``np.sum``) +or universal functions (like ``np.exp``). The semantics are very similar +to ``__array_ufunc__``, except the operation is specified by an +arbitrary callable object rather than a ufunc instance and method. + +The interface +~~~~~~~~~~~~~ + +We propose the following signature for implementations of +``__array_function__``: + +.. code-block:: python + + def __array_function__(self, func, types, args, kwargs) + +- ``func`` is an arbitrary callable exposed by NumPy's public API, + which was called in the form ``func(*args, **kwargs)``. +- ``types`` is a list of types for all arguments to the original NumPy + function call that will be checked for an ``__array_function__`` + implementation. 
+- The tuple ``args`` and dict ``**kwargs`` are directly passed on from the + original call. + +Unlike ``__array_ufunc__``, there are no high-level guarantees about the +type of ``func``, or about which of ``args`` and ``kwargs`` may contain objects +implementing the array API. As a convenience for ``__array_function__`` +implementors of the NumPy API, the ``types`` keyword contains a list of all +types that implement the ``__array_function__`` protocol. This allows +downstream implementations to quickly determine if they are likely able to +support the operation. + +Still be determined: what guarantees can we offer for ``types``? Should +we promise that types are unique, and appear in the order in which they +are checked? + +Example for a project implementing the NumPy API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Most implementations of ``__array_function__`` will start with two +checks: + +1. Is the given function something that we know how to overload? +2. Are all arguments of a type that we know how to handle? + +If these conditions hold, ``__array_function__`` should return +the result from calling its implementation for ``func(*args, **kwargs)``. +Otherwise, it should return the sentinel value ``NotImplemented``, indicating +that the function is not implemented by these types. + +.. code:: python + + class MyArray: + def __array_function__(self, func, types, args, kwargs): + if func not in HANDLED_FUNCTIONS: + return NotImplemented + if not all(issubclass(t, MyArray) for t in types): + return NotImplemented + return HANDLED_FUNCTIONS[func](*args, **kwargs) + + HANDLED_FUNCTIONS = { + np.concatenate: my_concatenate, + np.broadcast_to: my_broadcast_to, + np.sum: my_sum, + ... + } + +Necessary changes within the Numpy codebase itself +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This will require two changes within the Numpy codebase: + +1. 
A function to inspect available inputs, look for the + ``__array_function__`` attribute on those inputs, and call those + methods appropriately until one succeeds. This needs to be fast in the + common all-NumPy case. + + This is one additional function of moderate complexity. +2. Calling this function within all relevant Numpy functions. + + This affects many parts of the Numpy codebase, although with very low + complexity. + +Finding and calling the right ``__array_function__`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Given a Numpy function, ``*args`` and ``**kwargs`` inputs, we need to +search through ``*args`` and ``**kwargs`` for all appropriate inputs +that might have the ``__array_function__`` attribute. Then we need to +select among those possible methods and execute the right one. +Negotiating between several possible implementations can be complex. + +Finding arguments +''''''''''''''''' + +Valid arguments may be directly in the ``*args`` and ``**kwargs``, such +as in the case for ``np.tensordot(left, right, out=out)``, or they may +be nested within lists or dictionaries, such as in the case of +``np.concatenate([x, y, z])``. This can be problematic for two reasons: + +1. Some functions are given long lists of values, and traversing them + might be prohibitively expensive +2. Some function may have arguments that we don't want to inspect, even + if they have the ``__array_function__`` method + +To resolve these we ask the functions to provide an explicit list of +arguments that should be traversed. This is the ``relevant_arguments=`` +keyword in the examples below. + +Trying ``__array_function__`` methods until the right one works +''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +Many arguments may implement the ``__array_function__`` protocol. Some +of these may decide that, given the available inputs, they are unable to +determine the correct result. How do we call the right one? 
If several +are valid then which has precedence? + +The rules for dispatch with ``__array_function__`` match those for +``__array_ufunc__`` (see +`NEP-13 <http://www.numpy.org/neps/nep-0013-ufunc-overrides.html>`_). +In particular: + +- NumPy will gather implementations of ``__array_function__`` from all + specified inputs and call them in order: subclasses before + superclasses, and otherwise left to right. Note that in some edge cases, + this differs slightly from the + `current behavior <https://bugs.python.org/issue30140>`_ of Python. +- Implementations of ``__array_function__`` indicate that they can + handle the operation by returning any value other than + ``NotImplemented``. +- If all ``__array_function__`` methods return ``NotImplemented``, + NumPy will raise ``TypeError``. + +Changes within Numpy functions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Given a function defined above, for now call it +``do_array_function_dance``, we now need to call that function from +within every relevant Numpy function. This is a pervasive change, but of +fairly simple and innocuous code that should complete quickly and +without effect if no arguments implement the ``__array_function__`` +protocol. Let us consider a few examples of NumPy functions and how they +might be affected by this change: + +.. code:: python + + def broadcast_to(array, shape, subok=False): + success, value = do_array_function_dance( + func=broadcast_to, + relevant_arguments=[array], + args=(array,), + kwargs=dict(shape=shape, subok=subok)) + if success: + return value + + ... # continue with the definition of broadcast_to + + def concatenate(arrays, axis=0, out=None) + success, value = do_array_function_dance( + func=concatenate, + relevant_arguments=[arrays, out], + args=(arrays,), + kwargs=dict(axis=axis, out=out)) + if success: + return value + + ... 
# continue with the definition of concatenate
+
+The list of objects passed to ``relevant_arguments`` are those that should
+be inspected for ``__array_function__`` implementations.
+
+Alternatively, we could write these overloads with a decorator, e.g.,
+
+.. code:: python
+
+    @overload_for_array_function(['array'])
+    def broadcast_to(array, shape, subok=False):
+        ... # continue with the definition of broadcast_to
+
+    @overload_for_array_function(['arrays', 'out'])
+    def concatenate(arrays, axis=0, out=None):
+        ... # continue with the definition of concatenate
+
+The decorator ``overload_for_array_function`` would be written in terms
+of ``do_array_function_dance``.
+
+The downside of this approach would be a loss of introspection capability
+for NumPy functions on Python 2, since this requires the use of
+``inspect.Signature`` (only available on Python 3). However, NumPy won't
+be supporting Python 2 for `very much longer <http://www.numpy.org/neps/nep-0014-dropping-python2.7-proposal.html>`_.
+
+Use outside of NumPy
+~~~~~~~~~~~~~~~~~~~~
+
+There is nothing about this protocol that is particular to NumPy itself.
+Should we encourage use of the same ``__array_function__`` protocol in
+third-party libraries for overloading non-NumPy functions, e.g., for making
+array-implementation generic functionality in SciPy?
+
+This would offer significant advantages (SciPy wouldn't need to invent
+its own dispatch system) and no downsides that we can think of, because
+every function that dispatches with ``__array_function__`` already needs
+to be explicitly recognized. Libraries like Dask, CuPy, and Autograd
+already wrap a limited subset of SciPy functionality (e.g.,
+``scipy.linalg``) similarly to how they wrap NumPy.
+
+If we want to do this, we should consider exposing the helper function
+``do_array_function_dance()`` above as a public API.
+
+Non-goals
+---------
+
+We are aiming for a basic strategy that can be relatively mechanistically
+applied to almost all functions in NumPy's API in a relatively short
+period of time, the development cycle of a single NumPy release.
+
+We hope to get both the ``__array_function__`` protocol and all specific
+overloads right on the first try, but our explicit aim here is to get
+something that mostly works (and can be iterated upon), rather than to
+wait for an optimal implementation. The price of moving fast is that for
+now **this protocol should be considered strictly experimental**. We
+reserve the right to change the details of this protocol and how
+specific NumPy functions use it at any time in the future -- even in
+otherwise bug-fix only releases of NumPy.
+
+In particular, we don't plan to write additional NEPs that list all
+specific functions to overload, with exactly how they should be
+overloaded. We will leave this up to the discretion of committers on
+individual pull requests, trusting that they will surface any
+controversies for discussion by interested parties.
+
+However, we already know several families of functions that should be
+explicitly excluded from ``__array_function__``. These will need their
+own protocols:
+
+- universal functions, which already have their own protocol.
+- ``array`` and ``asarray``, because they are explicitly intended for
+  coercion to actual ``numpy.ndarray`` objects.
+- dispatch for methods of any kind, e.g., methods on
+  ``np.random.RandomState`` objects.
+
+As a concrete example of how we expect to break behavior in the future,
+some functions such as ``np.where`` are currently not NumPy universal
+functions, but conceivably could become universal functions in the
+future. When/if this happens, we will change such overloads from using
+``__array_function__`` to the more specialized ``__array_ufunc__``.
+
+
+Backward compatibility
+----------------------
+
+This proposal does not change existing semantics, except for those arguments
+that currently have ``__array_function__`` methods, which should be rare.
+
+
+Alternatives
+------------
+
+Specialized protocols
+~~~~~~~~~~~~~~~~~~~~~
+
+We could (and should) continue to develop protocols like
+``__array_ufunc__`` for cohesive subsets of Numpy functionality.
+
+As mentioned above, if this means that some functions that we overload
+with ``__array_function__`` should switch to a new protocol instead,
+that is explicitly OK for as long as ``__array_function__`` retains its
+experimental status.
+
+Separate namespace
+~~~~~~~~~~~~~~~~~~
+
+A separate namespace for overloaded functions is another possibility,
+either inside or outside of NumPy.
+
+This has the advantage of alleviating any possible concerns about
+backwards compatibility and would provide the maximum freedom for quick
+experimentation. In the long term, it would provide a clean abstraction
+layer, separating NumPy's high level API from default implementations on
+``numpy.ndarray`` objects.
+
+The downsides are that this would require an explicit opt-in from all
+existing code, e.g., ``import numpy.api as np``, and in the long term
+would result in the maintenance of two separate NumPy APIs. Also, many
+functions from ``numpy`` itself are already overloaded (but
+inadequately), so confusion about high vs. low level APIs in NumPy would
+still persist.
+
+Multiple dispatch
+~~~~~~~~~~~~~~~~~
+
+An alternative to our suggestion of the ``__array_function__`` protocol
+would be implementing NumPy's core functions as
+`multi-methods <https://en.wikipedia.org/wiki/Multiple_dispatch>`_.
+Although one of us wrote a `multiple dispatch
+library <https://github.com/mrocklin/multipledispatch>`_ for Python, we
+don't think this approach makes sense for NumPy in the near term.
+
+The main reason is that NumPy already has a well-proven dispatching
+mechanism with ``__array_ufunc__``, based on Python's own dispatching
+system for arithmetic, and it would be confusing to add another
+mechanism that works in a very different way. This would also be a more
+invasive change to NumPy itself, which would need to gain a multiple
+dispatch implementation.
+
+It is possible that a multiple dispatch implementation for NumPy's high
+level API could make sense in the future. Fortunately,
+``__array_function__`` does not preclude this possibility, because it
+would be straightforward to write a shim for a default
+``__array_function__`` implementation in terms of multiple dispatch.
+
+Implementations in terms of a limited core API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The internal implementations of some NumPy functions are extremely simple.
+For example: - ``np.stack()`` is implemented in only a few lines of code
+by combining indexing with ``np.newaxis``, ``np.concatenate`` and the
+``shape`` attribute. - ``np.mean()`` is implemented internally in terms
+of ``np.sum()``, ``np.divide()``, ``.astype()`` and ``.shape``.
+
+This suggests the possibility of defining a minimal "core" ndarray
+interface, and relying upon it internally in NumPy to implement the full
+API. This is an attractive option, because it could significantly reduce
+the work required for new array implementations.
+
+However, this also comes with several downsides: 1. The details of how
+NumPy implements a high-level function in terms of overloaded functions
+now becomes an implicit part of NumPy's public API. For example,
+refactoring ``stack`` to use ``np.block()`` instead of
+``np.concatenate()`` internally would now become a breaking change. 2.
+Array libraries may prefer to implement high level functions differently
+than NumPy.
For example, a library might prefer to implement a +fundamental operations like ``mean()`` directly rather than relying on +``sum()`` followed by division. More generally, it's not clear yet what +exactly qualifies as core functionality, and figuring this out could be +a large project. 3. We don't yet have an overloading system for +attributes and methods on array objects, e.g., for accessing ``.dtype`` +and ``.shape``. This should be the subject of a future NEP, but until +then we should be reluctant to rely on these properties. + +Given these concerns, we encourage relying on this approach only in +limited cases. + +Coersion to a NumPy array as a catch-all fallback +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the current design, classes that implement ``__array_function__`` +to overload at least one function implicitly declare an intent to +implement the entire NumPy API. It's not possible to implement *only* +``np.concatenate()`` on a type, but fall back to NumPy's default +behavior of casting with ``np.asarray()`` for all other functions. + +This could present a backwards compatibility concern that would +discourage libraries from adopting ``__array_function__`` in an +incremental fashion. For example, currently most numpy functions will +implicitly convert ``pandas.Series`` objects into NumPy arrays, behavior +that assuredly many pandas users rely on. If pandas implemented +``__array_function__`` only for ``np.concatenate``, unrelated NumPy +functions like ``np.nanmean`` would suddenly break on pandas objects by +raising TypeError. + +With ``__array_ufunc__``, it's possible to alleviate this concern by +casting all arguments to numpy arrays and re-calling the ufunc, but the +heterogeneous function signatures supported by ``__array_function__`` +make it impossible to implement this generic fallback behavior for +``__array_function__``. 
+ +We could resolve this issue by change the handling of return values in +``__array_function__`` in either of two possible ways: 1. Change the +meaning of all arguments returning ``NotImplemented`` to indicate that +all arguments should be coerced to NumPy arrays instead. However, many +array libraries (e.g., scipy.sparse) really don't want implicit +conversions to NumPy arrays, and often avoid implementing ``__array__`` +for exactly this reason. Implicit conversions can result in silent bugs +and performance degradation. 2. Use another sentinel value of some sort +to indicate that a class implementing part of the higher level array API +is coercible as a fallback, e.g., a return value of +``np.NotImplementedButCoercible`` from ``__array_function__``. + +If we take this second approach, we would need to define additional +rules for how coercible array arguments are coerced, e.g., - Would we +try for ``__array_function__`` overloads again after coercing coercible +arguments? - If so, would we coerce coercible arguments one-at-a-time, +or all-at-once? + +These are slightly tricky design questions, so for now we propose to +defer this issue. We can always implement +``np.NotImplementedButCoercible`` at some later time if it proves +critical to the numpy community in the future. Importantly, we don't +think this will stop critical libraries that desire to implement most of +the high level NumPy API from adopting this proposal. + +NOTE: If you are reading this NEP in its draft state and disagree, +please speak up on the mailing list! + +Drawbacks of this approach +-------------------------- + +Future difficulty extending NumPy's API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One downside of passing on all arguments directly on to +``__array_function__`` is that it makes it hard to extend the signatures +of overloaded NumPy functions with new arguments, because adding even an +optional keyword argument would break existing overloads. 
+ +This is not a new problem for NumPy. NumPy has occasionally changed the +signature for functions in the past, including functions like +``numpy.sum`` which support overloads. + +For adding new keyword arguments that do not change default behavior, we +would only include these as keyword arguments when they have changed +from default values. This is similar to `what NumPy already has +done <https://github.com/numpy/numpy/blob/v1.14.2/numpy/core/fromnumeric.py#L1865-L1867>`_, +e.g., for the optional ``keepdims`` argument in ``sum``: + +.. code:: python + + def sum(array, ..., keepdims=np._NoValue): + kwargs = {} + if keepdims is not np._NoValue: + kwargs['keepdims'] = keepdims + return array.sum(..., **kwargs) + +In other cases, such as deprecated arguments, preserving the existing +behavior of overloaded functions may not be possible. Libraries that use +``__array_function__`` should be aware of this risk: we don't propose to +freeze NumPy's API in stone any more than it already is. + +Difficulty adding implementation specific arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some array implementations generally follow NumPy's API, but have +additional optional keyword arguments (e.g., ``dask.array.sum()`` has +``split_every`` and ``tensorflow.reduce_sum()`` has ``name``). A generic +dispatching library could potentially pass on all unrecognized keyword +argument directly to the implementation, but extending ``np.sum()`` to +pass on ``**kwargs`` would entail public facing changes in NumPy. +Customizing the detailed behavior of array libraries will require using +library specific functions, which could be limiting in the case of +libraries that consume the NumPy API such as xarray. + + +Discussion +---------- + +Various alternatives to this proposal were discussed in a few Github issues: + +1. `pydata/sparse #1 <https://github.com/pydata/sparse/issues/1>`_ +2. 
`numpy/numpy #11129 <https://github.com/numpy/numpy/issues/11129>`_
+
+Additionally it was the subject of `a blogpost
+<http://matthewrocklin.com/blog/work/2018/05/27/beyond-numpy>`_. Following this
+it was discussed at a `NumPy developer sprint
+<https://scisprints.github.io/#may-numpy-developer-sprint>`_ at the `UC
+Berkeley Institute for Data Science (BIDS) <https://bids.berkeley.edu/>`_.
+
+
+References and Footnotes
+------------------------
+
+.. [1] Each NEP must either be explicitly labeled as placed in the public domain (see
+   this NEP as an example) or licensed under the `Open Publication License`_.
+
+.. _Open Publication License: http://www.opencontent.org/openpub/
+
+
+Copyright
+---------
+
+This document has been placed in the public domain. [1]_
diff --git a/doc/neps/nep-0019-rng-policy.rst b/doc/neps/nep-0019-rng-policy.rst
new file mode 100644
index 000000000..de9164bba
--- /dev/null
+++ b/doc/neps/nep-0019-rng-policy.rst
@@ -0,0 +1,208 @@
+==============================
+Random Number Generator Policy
+==============================
+
+:Author: Robert Kern <robert.kern@gmail.com>
+:Status: Draft
+:Type: Standards Track
+:Created: 2018-05-24
+
+
+Abstract
+--------
+
+For the past decade, NumPy has had a strict backwards compatibility policy for
+the number stream of all of its random number distributions. Unlike other
+numerical components in ``numpy``, which are usually allowed to return
+different results when they are modified, as long as they remain correct, we have
+obligated the random number distributions to always produce the exact same
+numbers in every version. The objective of our stream-compatibility guarantee
+was to provide exact reproducibility for simulations across numpy versions in
+order to promote reproducible research. However, this policy has made it very
+difficult to enhance any of the distributions with faster or more accurate
+algorithms.
After a decade of experience and improvements in the surrounding +ecosystem of scientific software, we believe that there are now better ways to +achieve these objectives. We propose relaxing our strict stream-compatibility +policy to remove the obstacles that are in the way of accepting contributions +to our random number generation capabilities. + + +The Status Quo +-------------- + +Our current policy, in full: + + A fixed seed and a fixed series of calls to ``RandomState`` methods using the + same parameters will always produce the same results up to roundoff error + except when the values were incorrect. Incorrect values will be fixed and + the NumPy version in which the fix was made will be noted in the relevant + docstring. Extension of existing parameter ranges and the addition of new + parameters is allowed as long the previous behavior remains unchanged. + +This policy was first instated in Nov 2008 (in essence; the full set of weasel +words grew over time) in response to a user wanting to be sure that the +simulations that formed the basis of their scientific publication could be +reproduced years later, exactly, with whatever version of ``numpy`` that was +current at the time. We were keen to support reproducible research, and it was +still early in the life of ``numpy.random``. We had not seen much cause to +change the distribution methods all that much. + +We also had not thought very thoroughly about the limits of what we really +could promise (and by “we” in this section, we really mean Robert Kern, let’s +be honest). Despite all of the weasel words, our policy overpromises +compatibility. The same version of ``numpy`` built on different platforms, or +just in a different way could cause changes in the stream, with varying degrees +of rarity. The biggest is that the ``.multivariate_normal()`` method relies on +``numpy.linalg`` functions. 
Even on the same platform, if one links ``numpy`` +with a different LAPACK, ``.multivariate_normal()`` may well return completely +different results. More rarely, building on a different OS or CPU can cause +differences in the stream. We use C ``long`` integers internally for integer +distribution (it seemed like a good idea at the time), and those can vary in +size depending on the platform. Distribution methods can overflow their +internal C ``longs`` at different breakpoints depending on the platform and +cause all of the random variate draws that follow to be different. + +And even if all of that is controlled, our policy still does not provide exact +guarantees across versions. We still do apply bug fixes when correctness is at +stake. And even if we didn’t do that, any nontrivial program does more than +just draw random numbers. They do computations on those numbers, transform +those with numerical algorithms from the rest of ``numpy``, which is not +subject to so strict a policy. Trying to maintain stream-compatibility for our +random number distributions does not help reproducible research for these +reasons. + +The standard practice now for bit-for-bit reproducible research is to pin all +of the versions of code of your software stack, possibly down to the OS itself. +The landscape for accomplishing this is much easier today than it was in 2008. +We now have ``pip``. We now have virtual machines. Those who need to +reproduce simulations exactly now can (and ought to) do so by using the exact +same version of ``numpy``. We do not need to maintain stream-compatibility +across ``numpy`` versions to help them. + +Our stream-compatibility guarantee has hindered our ability to make +improvements to ``numpy.random``. Several first-time contributors have +submitted PRs to improve the distributions, usually by implementing a faster, +or more accurate algorithm than the one that is currently there. 
+Unfortunately, most of them would have required breaking the stream to do so. +Blocked by our policy, and our inability to work around that policy, many of +those contributors simply walked away. + + +Implementation +-------------- + +We propose first freezing ``RandomState`` as it is and developing a new RNG +subsystem alongside it. This allows anyone who has been relying on our old +stream-compatibility guarantee to have plenty of time to migrate. +``RandomState`` will be considered deprecated, but with a long deprecation +cycle, at least a few years. Deprecation warnings will start silent but become +increasingly noisy over time. Bugs in the current state of the code will *not* +be fixed if fixing them would impact the stream. However, if changes in the +rest of ``numpy`` would break something in the ``RandomState`` code, we will +fix ``RandomState`` to continue working (for example, some change in the +C API). No new features will be added to ``RandomState``. Users should +migrate to the new subsystem as they are able to. + +Work on a proposed `new PRNG subsystem +<https://github.com/bashtage/randomgen>`_ is already underway. The specifics +of the new design are out of scope for this NEP and up for much discussion, but +we will discuss general policies that will guide the evolution of whatever code +is adopted. + +First, we will maintain API source compatibility just as we do with the rest of +``numpy``. If we *must* make a breaking change, we will only do so with an +appropriate deprecation period and warnings. + +Second, breaking stream-compatibility in order to introduce new features or +improve performance will be *allowed* with *caution*. Such changes will be +considered features, and as such will be no faster than the standard release +cadence of features (i.e. on ``X.Y`` releases, never ``X.Y.Z``). Slowness is +not a bug. 
Correctness bug fixes that break stream-compatibility can happen on +bugfix releases, per usual, but developers should consider if they can wait +until the next feature release. We encourage developers to strongly weight +user’s pain from the break in stream-compatibility against the improvements. +One example of a worthwhile improvement would be to change algorithms for +a significant increase in performance, for example, moving from the `Box-Muller +transform <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform>`_ method +of Gaussian variate generation to the faster `Ziggurat algorithm +<https://en.wikipedia.org/wiki/Ziggurat_algorithm>`_. An example of an +unworthy improvement would be tweaking the Ziggurat tables just a little bit. + +Any new design for the RNG subsystem will provide a choice of different core +uniform PRNG algorithms. We will be more strict about a select subset of +methods on these core PRNG objects. They MUST guarantee stream-compatibility +for a minimal, specified set of methods which are chosen to make it easier to +compose them to build other distributions. Namely, + + * ``.bytes()`` + * ``.random_uintegers()`` + * ``.random_sample()`` + +Furthermore, the new design should also provide one generator class (we shall +call it ``StableRandom`` for discussion purposes) that provides a slightly +broader subset of distribution methods for which stream-compatibility is +*guaranteed*. The point of ``StableRandom`` is to provide something that can +be used in unit tests so projects that currently have tests which rely on the +precise stream can be migrated off of ``RandomState``. For the best +transition, ``StableRandom`` should use as its core uniform PRNG the current +MT19937 algorithm. As best as possible, the API for the distribution methods +that are provided on ``StableRandom`` should match their counterparts on +``RandomState``. They should provide the same stream that the current version +of ``RandomState`` does. 
Because their intended use is for unit tests, we do +not need the performance improvements from the new algorithms that will be +introduced by the new subsystem. + +The list of ``StableRandom`` methods should be chosen to support unit tests: + + * ``.randint()`` + * ``.uniform()`` + * ``.normal()`` + * ``.standard_normal()`` + * ``.choice()`` + * ``.shuffle()`` + * ``.permutation()`` + + +Not Versioning +-------------- + +For a long time, we considered that the way to allow algorithmic improvements +while maintaining the stream was to apply some form of versioning. That is, +every time we make a stream change in one of the distributions, we increment +some version number somewhere. ``numpy.random`` would keep all past versions +of the code, and there would be a way to get the old versions. Proposals of +how to do this exactly varied widely, but we will not exhaustively list them +here. We spent years going back and forth on these designs and were not able +to find one that sufficed. Let that time lost, and more importantly, the +contributors that we lost while we dithered, serve as evidence against the +notion. + +Concretely, adding in versioning makes maintenance of ``numpy.random`` +difficult. Necessarily, we would be keeping lots of versions of the same code +around. Adding a new algorithm safely would still be quite hard. + +But most importantly, versioning is fundamentally difficult to *use* correctly. +We want to make it easy and straightforward to get the latest, fastest, best +versions of the distribution algorithms; otherwise, what's the point? The way +to make that easy is to make the latest the default. But the default will +necessarily change from release to release, so the user’s code would need to be +altered anyway to specify the specific version that one wants to replicate. 
+ +Adding in versioning to maintain stream-compatibility would still only provide +the same level of stream-compatibility that we currently do, with all of the +limitations described earlier. Given that the standard practice for such needs +is to pin the release of ``numpy`` as a whole, versioning ``RandomState`` alone +is superfluous. + + +Discussion +---------- + +- https://mail.python.org/pipermail/numpy-discussion/2018-January/077608.html +- https://github.com/numpy/numpy/pull/10124#issuecomment-350876221 + + +Copyright +--------- + +This document has been placed in the public domain. diff --git a/doc/neps/nep-template.rst b/doc/neps/nep-template.rst index 56b06049e..26515127d 100644 --- a/doc/neps/nep-template.rst +++ b/doc/neps/nep-template.rst @@ -6,7 +6,7 @@ NEP Template and Instructions :Status: <Draft | Active | Accepted | Deferred | Rejected | Withdrawn | Final | Superseded> :Type: <Standards Track | Process> :Created: <date created on, in yyyy-mm-dd format> - +:Resolution: <url> (required for Accepted | Rejected | Withdrawn) Abstract -------- diff --git a/doc/release/1.14.3-notes.rst b/doc/release/1.14.3-notes.rst new file mode 100644 index 000000000..60b631168 --- /dev/null +++ b/doc/release/1.14.3-notes.rst @@ -0,0 +1,41 @@ +========================== +NumPy 1.14.3 Release Notes +========================== + +This is a bugfix release for a few bugs reported following the 1.14.2 release: + +* np.lib.recfunctions.fromrecords accepts a list-of-lists, until 1.15 +* In python2, float types use the new print style when printing to a file +* style arg in "legacy" print mode now works for 0d arrays + +The Python versions supported in this release are 2.7 and 3.4 - 3.6. The Python +3.6 wheels available from PIP are built with Python 3.6.2 and should be +compatible with all previous versions of Python 3.6. The source releases were +cythonized with Cython 0.28.2. + +Contributors +============ + +A total of 6 people contributed to this release. 
People with a "+" by their +names contributed a patch for the first time. + +* Allan Haldane +* Charles Harris +* Jonathan March + +* Malcolm Smith + +* Matti Picus +* Pauli Virtanen + +Pull requests merged +==================== + +A total of 8 pull requests were merged for this release. + +* `#10862 <https://github.com/numpy/numpy/pull/10862>`__: BUG: floating types should override tp_print (1.14 backport) +* `#10905 <https://github.com/numpy/numpy/pull/10905>`__: BUG: for 1.14 back-compat, accept list-of-lists in fromrecords +* `#10947 <https://github.com/numpy/numpy/pull/10947>`__: BUG: 'style' arg to array2string broken in legacy mode (1.14... +* `#10959 <https://github.com/numpy/numpy/pull/10959>`__: BUG: test, fix for missing flags['WRITEBACKIFCOPY'] key +* `#10960 <https://github.com/numpy/numpy/pull/10960>`__: BUG: Add missing underscore to prototype in check_embedded_lapack +* `#10961 <https://github.com/numpy/numpy/pull/10961>`__: BUG: Fix encoding regression in ma/bench.py (Issue #10868) +* `#10962 <https://github.com/numpy/numpy/pull/10962>`__: BUG: core: fix NPY_TITLE_KEY macro on pypy +* `#10974 <https://github.com/numpy/numpy/pull/10974>`__: BUG: test, fix PyArray_DiscardWritebackIfCopy... diff --git a/doc/release/1.14.4-notes.rst b/doc/release/1.14.4-notes.rst new file mode 100644 index 000000000..174094c1c --- /dev/null +++ b/doc/release/1.14.4-notes.rst @@ -0,0 +1,60 @@ +========================== +NumPy 1.14.4 Release Notes +========================== + +This is a bugfix release for bugs reported following the 1.14.3 release. The +most significant fixes are: + +* fixes for compiler instruction reordering that resulted in NaN's not being + properly propagated in `np.max` and `np.min`, + +* fixes for bus faults on SPARC and older ARM due to incorrect alignment + checks. + +There are also improvements to printing of long doubles on PPC platforms. 
All +is not yet perfect on that platform, the whitespace padding is still incorrect +and is to be fixed in numpy 1.15, consequently NumPy still fails some +printing-related (and other) unit tests on ppc systems. However, the printed +values are now correct. + +Note that NumPy will error on import if it detects incorrect float32 `dot` +results. This problem has been seen on the Mac when working in the Anaconda +enviroment and is due to a subtle interaction between MKL and PyQt5. It is not +strictly a NumPy problem, but it is best that users be aware of it. See the +gh-8577 NumPy issue for more information. + +The Python versions supported in this release are 2.7 and 3.4 - 3.6. The Python +3.6 wheels available from PIP are built with Python 3.6.2 and should be +compatible with all previous versions of Python 3.6. The source releases were +cythonized with Cython 0.28.2 and should work for the upcoming Python 3.7. + +Contributors +============ + +A total of 7 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Allan Haldane +* Charles Harris +* Marten van Kerkwijk +* Matti Picus +* Pauli Virtanen +* Ryan Soklaski + +* Sebastian Berg + +Pull requests merged +==================== + +A total of 11 pull requests were merged for this release. + +* `#11104 <https://github.com/numpy/numpy/pull/11104>`__: BUG: str of DOUBLE_DOUBLE format wrong on ppc64 +* `#11170 <https://github.com/numpy/numpy/pull/11170>`__: TST: linalg: add regression test for gh-8577 +* `#11174 <https://github.com/numpy/numpy/pull/11174>`__: MAINT: add sanity-checks to be run at import time +* `#11181 <https://github.com/numpy/numpy/pull/11181>`__: BUG: void dtype setup checked offset not actual pointer for alignment +* `#11194 <https://github.com/numpy/numpy/pull/11194>`__: BUG: Python2 doubles don't print correctly in interactive shell. 
+* `#11198 <https://github.com/numpy/numpy/pull/11198>`__: BUG: optimizing compilers can reorder call to npy_get_floatstatus +* `#11199 <https://github.com/numpy/numpy/pull/11199>`__: BUG: reduce using SSE only warns if inside SSE loop +* `#11203 <https://github.com/numpy/numpy/pull/11203>`__: BUG: Bytes delimiter/comments in genfromtxt should be decoded +* `#11211 <https://github.com/numpy/numpy/pull/11211>`__: BUG: Fix reference count/memory leak exposed by better testing +* `#11219 <https://github.com/numpy/numpy/pull/11219>`__: BUG: Fixes einsum broadcasting bug when optimize=True +* `#11251 <https://github.com/numpy/numpy/pull/11251>`__: DOC: Document 1.14.4 release. diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst index 3ea51dca8..a269e25f1 100644 --- a/doc/release/1.15.0-notes.rst +++ b/doc/release/1.15.0-notes.rst @@ -20,15 +20,18 @@ New functions * ``nanquantile`` function, an interface to ``nanpercentile`` without factors of 100 -* `np.printoptions`, the context manager which sets print options temporarily +* `np.printoptions`, a context manager that sets print options temporarily for the scope of the ``with`` block:: >>> with np.printoptions(precision=2): ... print(np.array([2.0]) / 3) [0.67] - * `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram - without needing to calculate the histogram. +* `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram + without needing to calculate the histogram. + +* `npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier`` have been added to + deal with compiler optimization changing the order of operations. See below for details. Deprecations ============ @@ -42,6 +45,14 @@ Deprecations * `np.ma.loads`, `np.ma.dumps` * `np.ma.load`, `np.ma.dump` - these functions already failed on python 3, when called with a string. + +* Multidimensional indexing with anything but a tuple is + deprecated. 
This means that code such as ``ind = [slice(None), 0]``, + ``arr[[slice(None), 0]]`` should be changed to ``arr[tuple(ind)]``. This is + necessary to avoid ambiguity in expressions such as ``arr[[[0, 1], [0, 1]]]`` + which currently is interpreted as ``arr[array([0, 1]), array([0, 1])]``. + In future, this will be interpreted as ``arr[array([[0, 1], [0, 1]])]``. + * Direct imports from the following modules is deprecated. All testing related imports should come from `numpy.testing`. * `np.testing.utils` @@ -55,7 +66,7 @@ Deprecations In the future, it might return a different result. Use `np.sum(np.from_iter(generator))` or the built-in Python `sum` instead. -* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or +* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or ``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY`` flag set, before the array is deallocated. A deprecation warning will be emitted if those calls are not used when needed. @@ -64,7 +75,7 @@ Deprecations anytime one of the iterator operands is writeable, so that numpy can manage writeback semantics, or should call ``it.close()``. A `RuntimeWarning` will be emitted otherwise in these cases. Users of the C-API - should call ``NpyIter_Close`` before ``NpyIter_Dealloc``. + should call ``NpyIter_Close`` before ``NpyIter_Deallocate``. Future Changes @@ -74,6 +85,16 @@ Future Changes Compatibility notes =================== +The ``NpzFile`` returned by ``np.savez`` is now a `collections.abc.Mapping` +--------------------------------------------------------------------------- +This means it behaves like a readonly dictionary, and has a new ``.values()`` +method and ``len()`` implementation. + +On python 3, this means that ``.iteritems()``, ``.iterkeys()`` have been +deprecated, and ``.keys()`` and ``.items()`` now return views and not lists. +This is consistent with how the builtin ``dict`` type changed between python 2 +and python 3. 
+ Under certain conditions, nditer must be used in a context manager ------------------------------------------------------------------ When using an nditer with the ``"writeonly"`` or ``"readwrite"`` flags, there @@ -81,7 +102,7 @@ are some circumstances where nditer doesn't actually give you a view onto the writable array. Instead, it gives you a copy, and if you make changes to the copy, nditer later writes those changes back into your actual array. Currently, this writeback occurs when the array objects are garbage collected, which makes -this API error-prone on CPython and entirely broken on PyPy. Therefore, +this API error-prone on CPython and entirely broken on PyPy. Therefore, ``nditer`` should now be used as a context manager whenever using ``nditer`` with writeable arrays (``with np.nditer(...) as it: ...``). You may also explicitly call ``it.close()`` for cases where a context manager is unusable, @@ -121,12 +142,37 @@ longer possible, and objects expecting the old API are respected. The silent suc by removing the interception of an otherwise-normal Exception when ``axis`` was provided to an object using the old API. +unstructured void array's ``.item`` method now returns a bytes object +--------------------------------------------------------------------- +``.item`` now returns a ``bytes`` object instead of a buffer or byte array. +This may affect code which assumed the return value was mutable, which is no +longer the case. + +``copy.copy`` and ``copy.deepcopy`` no longer turn ``masked`` into an array +---------------------------------------------------------------------------- +Since ``np.ma.masked`` is a readonly scalar, copying should be a no-op. These +functions now behave consistently with ``np.copy()``. + C API changes ============= -``NpyIter_Close`` has been added and should be called before -``NpyIter_Dealloc`` to resolve possible writeback-enabled arrays. 
+* ``NpyIter_Close`` has been added and should be called before + ``NpyIter_Deallocate`` to resolve possible writeback-enabled arrays. + +* Functions ``npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier`` + have been added and should be used in place of the ``npy_get_floatstatus``and + ``npy_clear_status`` functions. Optimizing compilers like GCC 8.1 and Clang + were rearranging the order of operations when the previous functions were + used in the ufunc SIMD functions, resulting in the floatstatus flags being ' + checked before the operation whose status we wanted to check was run. + See `#10339 <https://github.com/numpy/numpy/issues/10370>`__. + +* ``PyArray_GetDTypeTransferFunction`` now defaults to using user-defined + ``copyswapn`` / ``copyswap`` for user-defined dtypes. If this causes a + significant performance hit, consider implementing ``copyswapn`` to reflect + the implementation of ``PyArray_GetStridedCopyFn``. + See `#10898 <https://github.com/numpy/numpy/pull/10898>`__. New Features ============ @@ -149,6 +195,11 @@ Creating a full iOS-compatible NumPy package requires building for the 5 architectures supported by iOS (i386, x86_64, armv7, armv7s and arm64), and combining these 5 compiled builds products into a single "fat" binary. +``return_indices`` keyword added for ``np.intersect1d`` +------------------------------------------------------- +New keyword ``return_indices`` returns the indices of the two input arrays +that correspond to the common elements. + ``np.quantile`` and ``np.nanquantile`` -------------------------------------- Like ``np.percentile`` and ``np.nanpercentile``, but takes quantiles in [0, 1] @@ -160,9 +211,16 @@ Build system ------------ Added experimental support for the 64-bit RISC-V architecture. 
+ Improvements ============ +``np.ufunc.reduce`` and related functions now accept an initial value +--------------------------------------------------------------------- +``np.ufunc.reduce``, ``np.sum``, ``np.prod``, ``np.min`` and ``np.max`` all +now accept an ``initial`` keyword argument that specifies the value to start +the reduction with. + ``np.flip`` can operate over multiple axes ------------------------------------------ ``np.flip`` now accepts None, or tuples of int, in its ``axis`` argument. If @@ -197,6 +255,13 @@ passed explicitly, and are not yet computed automatically. No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins chosen is related to the data size in this situation. +``histogram`` and ``histogramdd`` return edges matching the float type of the data +---------------------------------------------------------------------------------- +When passed ``float16``, ``np.float32``, or ``np.longdouble`` data, the +returned edges are now of the same dtype. Previously, ``histogram`` would only +return the same type if explicit bins were given, and ``histogram`` would +produce ``float64`` bins no matter what the inputs. + ``histogramdd`` allows explicit ranges to be given in a subset of axes ---------------------------------------------------------------------- The ``range`` argument of `histogramdd` can now contain ``None`` values to @@ -286,6 +351,81 @@ of the overlap between input an output, that is, the next element accumulated is added before the accumulated result is stored in its place, hence the overlap is safe. Avoiding the copy results in faster execution. +``linalg.matrix_power`` can now handle stacks of matrices +--------------------------------------------------------- +Like other functions in ``linalg``, ``matrix_power`` can now deal with arrays +of dimension larger than 2, which are treated as stacks of matrices. 
As part +of the change, to further improve consistency, the name of the first argument +has been changed to ``a`` (from ``M``), and the exceptions for non-square +matrices have been changed to ``LinAlgError`` (from ``ValueError``). + +Increased performance in ``random.permutation`` for multidimensional arrays +--------------------------------------------------------------------------- +``permutation`` uses the fast path in ``random.shuffle`` for all input +array dimensions. Previously the fast path was only used for 1-d arrays. + +Generalized ufuncs now accept ``axes``, ``axis`` and ``keepdims`` arguments +--------------------------------------------------------------------------- +One can control over which axes a generalized ufunc operates by passing in an +``axes`` argument, a list of tuples with indices of particular axes. For +instance, for a signature of ``(i,j),(j,k)->(i,k)`` appropriate for matrix +multiplication, the base elements are two-dimensional matrices and these are +taken to be stored in the two last axes of each argument. The corresponding +axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``. If one wanted to +use leading dimensions instead, one would pass in ``[(0, 1), (0, 1), (0, 1)]``. + +For simplicity, for generalized ufuncs that operate on 1-dimensional arrays +(vectors), a single integer is accepted instead of a single-element tuple, and +for generalized ufuncs for which all outputs are scalars, the (empty) output +tuples can be omitted. Hence, for a signature of ``(i),(i)->()`` appropriate +for an inner product, one could pass in ``axes=[0, 0]`` to indicate that the +vectors are stored in the first dimensions of the two inputs arguments. + +As a short-cut for generalized ufuncs that are similar to reductions, i.e., +that act on a single, shared core dimension such as the inner product example +above, one can pass an ``axis`` argument. 
This is equivalent to passing in +``axes`` with identical entries for all arguments with that core dimension +(e.g., for the example above, ``axes=[(axis,), (axis,)]``). + +Furthermore, like for reductions, for generalized ufuncs that have inputs that +all have the same number of core dimensions and outputs with no core dimension, +one can pass in ``keepdims`` to leave a dimension with size 1 in the outputs, +thus allowing proper broadcasting against the original inputs. The location of +the extra dimension can be controlled with ``axes``. For instance, for the +inner-product example, ``keepdims=True, axes=[-2, -2, -2]`` would act on the +inner-product example, ``keepdims=True, axis=-2`` would act on the +one-but-last dimension of the input arguments, and leave a size 1 dimension in +that place in the output. + +float128 values now print correctly on ppc systems +-------------------------------------------------- +Previously printing float128 values was buggy on ppc, since the special +double-double floating-point-format on these systems was not accounted for. +float128s now print with correct rounding and uniqueness. + +Warning to ppc users: You should upgrade glibc if it is version <=2.23, +especially if using float128. On ppc, glibc's malloc in these version often +misaligns allocated memory which can crash numpy when using float128 values. + +New ``np.take_along_axis`` and ``np.put_along_axis`` functions +-------------------------------------------------------------- +When used on multidimensional arrays, ``argsort``, ``argmin``, ``argmax``, and +``argpartition`` return arrays that are difficult to use as indices. +``take_along_axis`` provides an easy way to use these indices to lookup values +within an array, so that:: + + np.take_along_axis(a, np.argsort(a, axis=axis), axis=axis) + +is the same as:: + + np.sort(a, axis=axis) + +``np.put_along_axis`` acts as the dual operation for writing to these indices +within an array. + +.. 
note:: Implementations of ``__array_ufunc__`` should ensure that they can + handle either ``axis`` or ``axes``. In future, we may convert + ``axis`` to ``axes`` before passing it on. Changes ======= diff --git a/doc/scipy-sphinx-theme b/doc/scipy-sphinx-theme -Subproject c466764e2231ba132c09826b5b138fffa1cfcec +Subproject d990ab9134199f6496b9ac8567f10791f04a720 diff --git a/doc/source/_templates/autosummary/minimal_module.rst b/doc/source/_templates/autosummary/minimal_module.rst new file mode 100644 index 000000000..f0d9f00b2 --- /dev/null +++ b/doc/source/_templates/autosummary/minimal_module.rst @@ -0,0 +1,8 @@ +{{ fullname | escape | underline}} + +.. automodule:: {{ fullname }} + + {% block docstring %} + {% endblock %} + + diff --git a/doc/source/about.rst b/doc/source/about.rst index 24dc3d0a0..776488ea4 100644 --- a/doc/source/about.rst +++ b/doc/source/about.rst @@ -1,7 +1,7 @@ About NumPy =========== -`NumPy <http://www.scipy.org/NumpPy/>`__ is the fundamental package +NumPy is the fundamental package needed for scientific computing with Python. This package contains: - a powerful N-dimensional :ref:`array object <arrays>` @@ -42,6 +42,8 @@ Our main means of communication are: More information about the development of NumPy can be found at our `Developer Zone <https://scipy.scipy.org/scipylib/dev-zone.html>`__. 
+The project management structure can be found at our :doc:`governance page <dev/governance/index>` + About this documentation ======================== diff --git a/doc/source/dev/gitwash/development_workflow.rst b/doc/source/dev/gitwash/development_workflow.rst index 5476e3202..c6884a7cf 100644 --- a/doc/source/dev/gitwash/development_workflow.rst +++ b/doc/source/dev/gitwash/development_workflow.rst @@ -396,7 +396,7 @@ collaborator: Now all those people can do:: - git clone git@githhub.com:your-user-name/numpy.git + git clone git@github.com:your-user-name/numpy.git Remember that links starting with ``git@`` use the ssh protocol and are read-write; links starting with ``git://`` are read-only. diff --git a/doc/source/reference/arrays.classes.rst b/doc/source/reference/arrays.classes.rst index 2719f9239..f17cb932a 100644 --- a/doc/source/reference/arrays.classes.rst +++ b/doc/source/reference/arrays.classes.rst @@ -215,6 +215,13 @@ Matrix objects .. index:: single: matrix +.. note:: + It is strongly advised *not* to use the matrix subclass. As described + below, it makes writing functions that deal consistently with matrices + and regular arrays very difficult. Currently, they are mainly used for + interacting with ``scipy.sparse``. We hope to provide an alternative + for this use, however, and eventually remove the ``matrix`` subclass. + :class:`matrix` objects inherit from the ndarray and therefore, they have the same attributes and methods of ndarrays. There are six important differences of matrix objects, however, that may lead to diff --git a/doc/source/reference/arrays.indexing.rst b/doc/source/reference/arrays.indexing.rst index b5a44c22a..ba1bfd312 100644 --- a/doc/source/reference/arrays.indexing.rst +++ b/doc/source/reference/arrays.indexing.rst @@ -29,11 +29,15 @@ dimensions. Basic slicing occurs when *obj* is a :class:`slice` object (constructed by ``start:stop:step`` notation inside of brackets), an integer, or a tuple of slice objects and integers. 
:const:`Ellipsis` and :const:`newaxis` objects can be interspersed with these as -well. In order to remain backward compatible with a common usage in -Numeric, basic slicing is also initiated if the selection object is -any non-ndarray sequence (such as a :class:`list`) containing :class:`slice` -objects, the :const:`Ellipsis` object, or the :const:`newaxis` object, -but not for integer arrays or other embedded sequences. +well. + +.. deprecated:: 1.15.0 + + In order to remain backward compatible with a common usage in + Numeric, basic slicing is also initiated if the selection object is + any non-ndarray and non-tuple sequence (such as a :class:`list`) containing + :class:`slice` objects, the :const:`Ellipsis` object, or the :const:`newaxis` + object, but not for integer arrays or other embedded sequences. .. index:: triple: ndarray; special methods; getitem @@ -196,7 +200,8 @@ basic slicing that returns a :term:`view`). why this occurs. Also recognize that ``x[[1,2,3]]`` will trigger advanced indexing, - whereas ``x[[1,2,slice(None)]]`` will trigger basic slicing. + whereas due to the deprecated Numeric compatibility mentioned above, + ``x[[1,2,slice(None)]]`` will trigger basic slicing. Integer array indexing ^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst index acad29b11..239f4296b 100644 --- a/doc/source/reference/arrays.nditer.rst +++ b/doc/source/reference/arrays.nditer.rst @@ -78,27 +78,28 @@ order='C' for C order and order='F' for Fortran order. ... 0 3 1 4 2 5 +.. _nditer-context-manager: + Modifying Array Values ---------------------- -By default, the :class:`nditer` treats the input array as a read-only -object. To modify the array elements, you must specify either read-write -or write-only mode. This is controlled with per-operand flags. The -operands may be created as views into the original data with the -`WRITEBACKIFCOPY` flag. 
In this case the iterator must either - -- be used as a context manager, and the temporary data will be written back - to the original array when the `__exit__` function is called. -- have a call to the iterator's `close` function to ensure the modified data - is written back to the original array. - -Regular assignment in Python simply changes a reference in the local or -global variable dictionary instead of modifying an existing variable in -place. This means that simply assigning to `x` will not place the value -into the element of the array, but rather switch `x` from being an array -element reference to being a reference to the value you assigned. To -actually modify the element of the array, `x` should be indexed with -the ellipsis. +By default, the :class:`nditer` treats the input operand as a read-only +object. To be able to modify the array elements, you must specify either +read-write or write-only mode using the `'readwrite'` or `'writeonly'` +per-operand flags. + +The nditer will then yield writeable buffer arrays which you may modify. However, +because the nditer must copy this buffer data back to the original array once +iteration is finished, you must signal when the iteration is ended, by one of two +methods. You may either: + + - used the nditer as a context manager using the `with` statement, and + the temporary data will be written back when the context is exited. + - call the iterator's `close` method once finished iterating, which will trigger + the write-back. + +The nditer can no longer be iterated once either `close` is called or its +context is exited. .. admonition:: Example @@ -186,7 +187,7 @@ construct in order to be more readable. 0 <(0, 0)> 1 <(0, 1)> 2 <(0, 2)> 3 <(1, 0)> 4 <(1, 1)> 5 <(1, 2)> >>> it = np.nditer(a, flags=['multi_index'], op_flags=['writeonly']) - >>> with it: + >>> with it: .... while not it.finished: ... it[0] = it.multi_index[1] - it.multi_index[0] ... 
it.iternext() diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst index d3f7fcf75..ad92235da 100644 --- a/doc/source/reference/c-api.coremath.rst +++ b/doc/source/reference/c-api.coremath.rst @@ -183,14 +183,46 @@ Those can be useful for precise floating point comparison. * NPY_FPE_UNDERFLOW * NPY_FPE_INVALID + Note that :c:func:`npy_get_floatstatus_barrier` is preferable as it prevents + agressive compiler optimizations reordering the call relative to + the code setting the status, which could lead to incorrect results. + .. versionadded:: 1.9.0 +.. c:function:: int npy_get_floatstatus_barrier(char*) + + Get floating point status. A pointer to a local variable is passed in to + prevent aggresive compiler optimizations from reodering this function call + relative to the code setting the status, which could lead to incorrect + results. + + Returns a bitmask with following possible flags: + + * NPY_FPE_DIVIDEBYZERO + * NPY_FPE_OVERFLOW + * NPY_FPE_UNDERFLOW + * NPY_FPE_INVALID + + .. versionadded:: 1.15.0 + .. c:function:: int npy_clear_floatstatus() Clears the floating point status. Returns the previous status mask. + Note that :c:func:`npy_clear_floatstatus_barrier` is preferable as it + prevents agressive compiler optimizations reordering the call relative to + the code setting the status, which could lead to incorrect results. + .. versionadded:: 1.9.0 +.. c:function:: int npy_clear_floatstatus_barrier(char*) + + Clears the floating point status. A pointer to a local variable is passed in to + prevent aggresive compiler optimizations from reodering this function call. + Returns the previous status mask. + + .. 
versionadded:: 1.15.0 +n Complex functions ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/c-api.generalized-ufuncs.rst b/doc/source/reference/c-api.generalized-ufuncs.rst index a53228cb5..dd8cf6558 100644 --- a/doc/source/reference/c-api.generalized-ufuncs.rst +++ b/doc/source/reference/c-api.generalized-ufuncs.rst @@ -101,6 +101,7 @@ Dimension Index enumerates the dimension names according to the order of the first occurrence of each name in the signature. +.. _details-of-signature: Details of Signature -------------------- @@ -126,9 +127,9 @@ The formal syntax of signatures is as follows:: <Output arguments> ::= <Argument list> <Argument list> ::= nil | <Argument> | <Argument> "," <Argument list> <Argument> ::= "(" <Core dimension list> ")" - <Core dimension list> ::= nil | <Dimension name> | - <Dimension name> "," <Core dimension list> - <Dimension name> ::= valid Python variable name + <Core dimension list> ::= nil | <Core dimension name> | + <Core dimension name> "," <Core dimension list> + <Core dimension name> ::= valid Python variable name Notes: diff --git a/doc/source/reference/c-api.iterator.rst b/doc/source/reference/c-api.iterator.rst index 17f1c45f2..392dcb730 100644 --- a/doc/source/reference/c-api.iterator.rst +++ b/doc/source/reference/c-api.iterator.rst @@ -709,6 +709,10 @@ Construction and Destruction the functions will pass back errors through it instead of setting a Python exception. + :c:func:`NpyIter_Deallocate` must be called for each copy. One call to + :c:func:`NpyIter_Close` is sufficient to trigger writeback resolution for + all copies since they share buffers. + .. c:function:: int NpyIter_RemoveAxis(NpyIter* iter, int axis)`` Removes an axis from iteration. This requires that @@ -761,8 +765,10 @@ Construction and Destruction .. c:function:: int NpyIter_Close(NpyIter* iter) - Resolves any needed writeback resolution. Must be called before - ``NpyIter_Deallocate``. After this call it is not safe to use the operands. 
+ Resolves any needed writeback resolution. Should be called before + :c:func::`NpyIter_Deallocate`. After this call it is not safe to use the operands. + When using :c:func:`NpyIter_Copy`, only one call to :c:func:`NpyIter_Close` + is sufficient to resolve any writebacks, since the copies share buffers. Returns ``0`` or ``-1`` if unsuccessful. @@ -770,8 +776,8 @@ Construction and Destruction Deallocates the iterator object. - `NpyIter_Close` should be called before this. If not, and if writeback is - needed, it will be performed at this point in order to maintain + :c:func:`NpyIter_Close` should be called before this. If not, and if + writeback is needed, it will be performed at this point in order to maintain backward-compatibility with older code, and a deprecation warning will be emitted. Old code should be updated to call `NpyIter_Close` beforehand. diff --git a/doc/source/reference/c-api.ufunc.rst b/doc/source/reference/c-api.ufunc.rst index 79ad256f5..02a35cf56 100644 --- a/doc/source/reference/c-api.ufunc.rst +++ b/doc/source/reference/c-api.ufunc.rst @@ -93,12 +93,23 @@ Functions the corresponding 1-d loop function in the func array. :param types: - Must be of length (*nin* + *nout*) \* *ntypes*, and it - contains the data-types (built-in only) that the corresponding - function in the *func* array can deal with. + Length ``(nin + nout) * ntypes`` array of ``char`` encoding the + :ref:`PyArray_Descr.type_num` (built-in only) that the corresponding + function in the ``func`` array accepts. 
For instance, for a comparison + ufunc with three ``ntypes``, two ``nin`` and one ``nout``, where the + first function accepts :ref:`npy_int32` and the the second + :ref:`npy_int64`, with both returning :ref:`npy_bool`, ``types`` would + be ``(char[]) {5, 5, 0, 7, 7, 0}`` since ``NPY_INT32`` is 5, + ``NPY_INT64`` is 7, and ``NPY_BOOL`` is 0 (on the python side, these + are exposed via :ref:`dtype.num`, i.e., for the example here, + ``dtype(np.int32).num``, ``dtype(np.int64).num``, and + ``dtype(np.bool_).num``, resp.). + + :ref:`casting-rules` will be used at runtime to find the first + ``func`` callable by the input/output provided. :param ntypes: - How many different data-type "signatures" the ufunc has implemented. + How many different data-type-specific functions the ufunc has implemented. :param nin: The number of inputs to this operation. @@ -129,10 +140,11 @@ Functions int nin, int nout, int identity, char* name, char* doc, int unused, char *signature) This function is very similar to PyUFunc_FromFuncAndData above, but has - an extra *signature* argument, to define generalized universal functions. + an extra *signature* argument, to define a + :ref:`generalized universal functions <c-api.generalized-ufuncs>`. Similarly to how ufuncs are built around an element-by-element operation, - gufuncs are around subarray-by-subarray operations, the signature defining - the subarrays to operate on. + gufuncs are around subarray-by-subarray operations, the + :ref:`signature <details-of-signature>` defining the subarrays to operate on. :param signature: The signature for the new gufunc. 
Setting it to NULL is equivalent diff --git a/doc/source/reference/routines.indexing.rst b/doc/source/reference/routines.indexing.rst index 4d2458d2f..aeec1a1bb 100644 --- a/doc/source/reference/routines.indexing.rst +++ b/doc/source/reference/routines.indexing.rst @@ -36,6 +36,7 @@ Indexing-like operations :toctree: generated/ take + take_along_axis choose compress diag @@ -50,6 +51,7 @@ Inserting data into arrays place put + put_along_axis putmask fill_diagonal diff --git a/doc/source/reference/routines.io.rst b/doc/source/reference/routines.io.rst index 573498792..55489951f 100644 --- a/doc/source/reference/routines.io.rst +++ b/doc/source/reference/routines.io.rst @@ -14,7 +14,7 @@ NumPy binary files (NPY, NPZ) savez_compressed The format of these binary file types is documented in -http://numpy.github.io/neps/npy-format.html +:py:mod:`numpy.lib.format` Text files ---------- @@ -78,3 +78,11 @@ Data sources :toctree: generated/ DataSource + +Binary Format Description +------------------------- +.. autosummary:: + :template: autosummary/minimal_module.rst + :toctree: generated/ + + lib.format diff --git a/doc/source/reference/swig.testing.rst b/doc/source/reference/swig.testing.rst index 13642a52e..594df952e 100644 --- a/doc/source/reference/swig.testing.rst +++ b/doc/source/reference/swig.testing.rst @@ -22,7 +22,7 @@ typemaps are working as expected. Testing Organization -------------------- -There are three indepedent testing frameworks supported, for one-, +There are three independent testing frameworks supported, for one-, two-, and three-dimensional arrays respectively. For one-dimensional arrays, there are two C++ files, a header and a source, named:: diff --git a/doc/source/reference/ufuncs.rst b/doc/source/reference/ufuncs.rst index 59d25a9ca..3cc956887 100644 --- a/doc/source/reference/ufuncs.rst +++ b/doc/source/reference/ufuncs.rst @@ -327,6 +327,13 @@ advanced usage and will not typically be used. 
multiple outputs is deprecated, and will raise a warning in numpy 1.10, and an error in a future release. + If 'out' is None (the default), a uninitialized return array is created. + The output array is then filled with the results of the ufunc in the places + that the broadcast 'where' is True. If 'where' is the scalar True (the + default), then this corresponds to the entire output being filled. + Note that outputs not explicitly filled are left with their + uninitialized values. + *where* .. versionadded:: 1.7 @@ -336,6 +343,9 @@ advanced usage and will not typically be used. of False indicate to leave the value in the output alone. This argument cannot be used for generalized ufuncs as those take non-scalar input. + Note that if an uninitialized return array is created, values of False + will leave those values **uninitialized**. + *axes* .. versionadded:: 1.15 @@ -350,6 +360,29 @@ advanced usage and will not typically be used. and for generalized ufuncs for which all outputs are scalars, the output tuples can be omitted. +*axis* + + .. versionadded:: 1.15 + + A single axis over which a generalized ufunc should operate. This is a + short-cut for ufuncs that operate over a single, shared core dimension, + equivalent to passing in ``axes`` with entries of ``(axis,)`` for each + single-core-dimension argument and ``()`` for all others. For instance, + for a signature ``(i),(i)->()``, it is equivalent to passing in + ``axes=[(axis,), (axis,), ()]``. + +*keepdims* + + .. versionadded:: 1.15 + + If this is set to `True`, axes which are reduced over will be left in the + result as a dimension with size one, so that the result will broadcast + correctly against the inputs. This option can only be used for generalized + ufuncs that operate on inputs that all have the same number of core + dimensions and with outputs that have no core dimensions , i.e., with + signatures like ``(i),(i)->()`` or ``(m,m)->()``. 
If used, the location of + the dimensions in the output can be controlled with ``axes`` and ``axis``. + *casting* .. versionadded:: 1.6 @@ -402,8 +435,8 @@ advanced usage and will not typically be used. provided by the **types** attribute of the ufunc object. For backwards compatibility this argument can also be provided as *sig*, although the long form is preferred. Note that this should not be confused with - the generalized ufunc signature that is stored in the **signature** - attribute of the of the ufunc object. + the generalized ufunc :ref:`signature <details-of-signature>` that is + stored in the **signature** attribute of the of the ufunc object. *extobj* diff --git a/doc/source/release.rst b/doc/source/release.rst index 224436b82..913db1fab 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -3,6 +3,8 @@ Release Notes ************* .. include:: ../release/1.15.0-notes.rst +.. include:: ../release/1.14.4-notes.rst +.. include:: ../release/1.14.3-notes.rst .. include:: ../release/1.14.2-notes.rst .. include:: ../release/1.14.1-notes.rst .. include:: ../release/1.14.0-notes.rst diff --git a/doc/source/user/basics.io.genfromtxt.rst b/doc/source/user/basics.io.genfromtxt.rst index 17774eeeb..21832e5aa 100644 --- a/doc/source/user/basics.io.genfromtxt.rst +++ b/doc/source/user/basics.io.genfromtxt.rst @@ -19,7 +19,7 @@ other faster and simpler functions like :func:`~numpy.loadtxt` cannot. When giving examples, we will use the following conventions:: >>> import numpy as np - >>> from io import BytesIO + >>> from io import StringIO @@ -30,7 +30,7 @@ The only mandatory argument of :func:`~numpy.genfromtxt` is the source of the data. It can be a string, a list of strings, or a generator. If a single string is provided, it is assumed to be the name of a local or remote file, or an open file-like object with a :meth:`read` method, for -example, a file or :class:`StringIO.StringIO` object. 
If a list of strings +example, a file or :class:`io.StringIO` object. If a list of strings or a generator returning strings is provided, each string is treated as one line in a file. When the URL of a remote file is passed, the file is automatically downloaded to the current directory and opened. @@ -58,8 +58,8 @@ Quite often, a single character marks the separation between columns. For example, comma-separated files (CSV) use a comma (``,``) or a semicolon (``;``) as delimiter:: - >>> data = "1, 2, 3\n4, 5, 6" - >>> np.genfromtxt(BytesIO(data), delimiter=",") + >>> data = u"1, 2, 3\n4, 5, 6" + >>> np.genfromtxt(StringIO(data), delimiter=",") array([[ 1., 2., 3.], [ 4., 5., 6.]]) @@ -74,13 +74,13 @@ defined as a given number of characters. In that case, we need to set ``delimiter`` to a single integer (if all the columns have the same size) or to a sequence of integers (if columns can have different sizes):: - >>> data = " 1 2 3\n 4 5 67\n890123 4" - >>> np.genfromtxt(BytesIO(data), delimiter=3) + >>> data = u" 1 2 3\n 4 5 67\n890123 4" + >>> np.genfromtxt(StringIO(data), delimiter=3) array([[ 1., 2., 3.], [ 4., 5., 67.], [ 890., 123., 4.]]) - >>> data = "123456789\n 4 7 9\n 4567 9" - >>> np.genfromtxt(BytesIO(data), delimiter=(4, 3, 2)) + >>> data = u"123456789\n 4 7 9\n 4567 9" + >>> np.genfromtxt(StringIO(data), delimiter=(4, 3, 2)) array([[ 1234., 567., 89.], [ 4., 7., 9.], [ 4., 567., 9.]]) @@ -94,14 +94,14 @@ individual entries are not stripped of leading nor trailing white spaces. 
This behavior can be overwritten by setting the optional argument ``autostrip`` to a value of ``True``:: - >>> data = "1, abc , 2\n 3, xxx, 4" + >>> data = u"1, abc , 2\n 3, xxx, 4" >>> # Without autostrip - >>> np.genfromtxt(BytesIO(data), delimiter=",", dtype="|U5") + >>> np.genfromtxt(StringIO(data), delimiter=",", dtype="|U5") array([['1', ' abc ', ' 2'], ['3', ' xxx', ' 4']], dtype='|U5') >>> # With autostrip - >>> np.genfromtxt(BytesIO(data), delimiter=",", dtype="|U5", autostrip=True) + >>> np.genfromtxt(StringIO(data), delimiter=",", dtype="|U5", autostrip=True) array([['1', 'abc', '2'], ['3', 'xxx', '4']], dtype='|U5') @@ -116,7 +116,7 @@ string that marks the beginning of a comment. By default, occur anywhere on the line. Any character present after the comment marker(s) is simply ignored:: - >>> data = """# + >>> data = u"""# ... # Skip me ! ... # Skip me too ! ... 1, 2 @@ -126,7 +126,7 @@ marker(s) is simply ignored:: ... # And here comes the last line ... 9, 0 ... """ - >>> np.genfromtxt(BytesIO(data), comments="#", delimiter=",") + >>> np.genfromtxt(StringIO(data), comments="#", delimiter=",") [[ 1. 2.] [ 3. 4.] [ 5. 6.] @@ -156,10 +156,10 @@ of lines to skip at the beginning of the file, before any other action is performed. Similarly, we can skip the last ``n`` lines of the file by using the ``skip_footer`` attribute and giving it a value of ``n``:: - >>> data = "\n".join(str(i) for i in range(10)) - >>> np.genfromtxt(BytesIO(data),) + >>> data = u"\n".join(str(i) for i in range(10)) + >>> np.genfromtxt(StringIO(data),) array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]) - >>> np.genfromtxt(BytesIO(data), + >>> np.genfromtxt(StringIO(data), ... skip_header=3, skip_footer=5) array([ 3., 4.]) @@ -180,8 +180,8 @@ integers behave the same as regular Python negative indexes. 
For example, if we want to import only the first and the last columns, we can use ``usecols=(0, -1)``:: - >>> data = "1 2 3\n4 5 6" - >>> np.genfromtxt(BytesIO(data), usecols=(0, -1)) + >>> data = u"1 2 3\n4 5 6" + >>> np.genfromtxt(StringIO(data), usecols=(0, -1)) array([[ 1., 3.], [ 4., 6.]]) @@ -189,12 +189,12 @@ If the columns have names, we can also select which columns to import by giving their name to the ``usecols`` argument, either as a sequence of strings or a comma-separated string:: - >>> data = "1 2 3\n4 5 6" - >>> np.genfromtxt(BytesIO(data), + >>> data = u"1 2 3\n4 5 6" + >>> np.genfromtxt(StringIO(data), ... names="a, b, c", usecols=("a", "c")) array([(1.0, 3.0), (4.0, 6.0)], dtype=[('a', '<f8'), ('c', '<f8')]) - >>> np.genfromtxt(BytesIO(data), + >>> np.genfromtxt(StringIO(data), ... names="a, b, c", usecols=("a, c")) array([(1.0, 3.0), (4.0, 6.0)], dtype=[('a', '<f8'), ('c', '<f8')]) @@ -252,7 +252,7 @@ A natural approach when dealing with tabular data is to allocate a name to each column. A first possibility is to use an explicit structured dtype, as mentioned previously:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> np.genfromtxt(data, dtype=[(_, int) for _ in "abc"]) array([(1, 2, 3), (4, 5, 6)], dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')]) @@ -260,7 +260,7 @@ as mentioned previously:: Another simpler possibility is to use the ``names`` keyword with a sequence of strings or a comma-separated string:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> np.genfromtxt(data, names="A, B, C") array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)], dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8')]) @@ -274,7 +274,7 @@ that case, we must use the ``names`` keyword with a value of ``True``. 
The names will then be read from the first line (after the ``skip_header`` ones), even if the line is commented out:: - >>> data = BytesIO("So it goes\n#a b c\n1 2 3\n 4 5 6") + >>> data = StringIO("So it goes\n#a b c\n1 2 3\n 4 5 6") >>> np.genfromtxt(data, skip_header=1, names=True) array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)], dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8')]) @@ -283,7 +283,7 @@ The default value of ``names`` is ``None``. If we give any other value to the keyword, the new names will overwrite the field names we may have defined with the dtype:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> ndtype=[('a',int), ('b', float), ('c', int)] >>> names = ["A", "B", "C"] >>> np.genfromtxt(data, names=names, dtype=ndtype) @@ -298,7 +298,7 @@ If ``names=None`` but a structured dtype is expected, names are defined with the standard NumPy default of ``"f%i"``, yielding names like ``f0``, ``f1`` and so forth:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> np.genfromtxt(data, dtype=(int, float, int)) array([(1, 2.0, 3), (4, 5.0, 6)], dtype=[('f0', '<i8'), ('f1', '<f8'), ('f2', '<i8')]) @@ -306,7 +306,7 @@ with the standard NumPy default of ``"f%i"``, yielding names like ``f0``, In the same way, if we don't give enough names to match the length of the dtype, the missing names will be defined with this default template:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> np.genfromtxt(data, dtype=(int, float, int), names="a") array([(1, 2.0, 3), (4, 5.0, 6)], dtype=[('a', '<i8'), ('f0', '<f8'), ('f1', '<i8')]) @@ -314,7 +314,7 @@ dtype, the missing names will be defined with this default template:: We can overwrite this default with the ``defaultfmt`` argument, that takes any format string:: - >>> data = BytesIO("1 2 3\n 4 5 6") + >>> data = StringIO("1 2 3\n 4 5 6") >>> np.genfromtxt(data, dtype=(int, float, int), defaultfmt="var_%02i") array([(1, 2.0, 3), (4, 5.0, 
6)], dtype=[('var_00', '<i8'), ('var_01', '<f8'), ('var_02', '<i8')]) @@ -377,10 +377,10 @@ In the following example, the second column is converted from as string representing a percentage to a float between 0 and 1:: >>> convertfunc = lambda x: float(x.strip("%"))/100. - >>> data = "1, 2.3%, 45.\n6, 78.9%, 0" + >>> data = u"1, 2.3%, 45.\n6, 78.9%, 0" >>> names = ("i", "p", "n") >>> # General case ..... - >>> np.genfromtxt(BytesIO(data), delimiter=",", names=names) + >>> np.genfromtxt(StringIO(data), delimiter=",", names=names) array([(1.0, nan, 45.0), (6.0, nan, 0.0)], dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')]) @@ -390,7 +390,7 @@ and ``' 78.9%'`` cannot be converted to float and we end up having ``np.nan`` instead. Let's now use a converter:: >>> # Converted case ... - >>> np.genfromtxt(BytesIO(data), delimiter=",", names=names, + >>> np.genfromtxt(StringIO(data), delimiter=",", names=names, ... converters={1: convertfunc}) array([(1.0, 0.023, 45.0), (6.0, 0.78900000000000003, 0.0)], dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')]) @@ -399,7 +399,7 @@ The same results can be obtained by using the name of the second column (``"p"``) as key instead of its index (1):: >>> # Using a name for the converter ... - >>> np.genfromtxt(BytesIO(data), delimiter=",", names=names, + >>> np.genfromtxt(StringIO(data), delimiter=",", names=names, ... converters={"p": convertfunc}) array([(1.0, 0.023, 45.0), (6.0, 0.78900000000000003, 0.0)], dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')]) @@ -411,9 +411,9 @@ string into the corresponding float or into -999 if the string is empty. We need to explicitly strip the string from white spaces as it is not done by default:: - >>> data = "1, , 3\n 4, 5, 6" + >>> data = u"1, , 3\n 4, 5, 6" >>> convert = lambda x: float(x.strip() or -999) - >>> np.genfromtxt(BytesIO(data), delimiter=",", + >>> np.genfromtxt(StringIO(data), delimiter=",", ... 
converters={1: convert}) array([[ 1., -999., 3.], [ 4., 5., 6.]]) @@ -489,13 +489,13 @@ with ``"N/A"`` in the first column and by ``"???"`` in the third column. We wish to transform these missing values to 0 if they occur in the first and second column, and to -999 if they occur in the last column:: - >>> data = "N/A, 2, 3\n4, ,???" + >>> data = u"N/A, 2, 3\n4, ,???" >>> kwargs = dict(delimiter=",", ... dtype=int, ... names="a,b,c", ... missing_values={0:"N/A", 'b':" ", 2:"???"}, ... filling_values={0:0, 'b':0, 2:-999}) - >>> np.genfromtxt(BytesIO(data), **kwargs) + >>> np.genfromtxt(StringIO(data), **kwargs) array([(0, 2, 3), (4, 0, -999)], dtype=[('a', '<i8'), ('b', '<i8'), ('c', '<i8')]) diff --git a/doc/source/user/basics.rec.rst b/doc/source/user/basics.rec.rst index 1be5af081..b885c9e77 100644 --- a/doc/source/user/basics.rec.rst +++ b/doc/source/user/basics.rec.rst @@ -5,3 +5,9 @@ Structured arrays ***************** .. automodule:: numpy.doc.structured_arrays + +Recarray Helper Functions +************************* + +.. automodule:: numpy.lib.recfunctions + :members: diff --git a/doc/source/user/c-info.ufunc-tutorial.rst b/doc/source/user/c-info.ufunc-tutorial.rst index addc38f45..5818ff182 100644 --- a/doc/source/user/c-info.ufunc-tutorial.rst +++ b/doc/source/user/c-info.ufunc-tutorial.rst @@ -17,8 +17,8 @@ Creating a new universal function Before reading this, it may help to familiarize yourself with the basics of C extensions for Python by reading/skimming the tutorials in Section 1 of `Extending and Embedding the Python Interpreter -<http://docs.python.org/extending/index.html>`_ and in `How to extend -NumPy <http://docs.scipy.org/doc/numpy/user/c-info.how-to-extend.html>`_ +<http://docs.python.org/extending/index.html>`_ and in :doc:`How to extend +NumPy <c-info.how-to-extend>` The umath module is a computer-generated C-module that creates many ufuncs. 
It provides a great many examples of how to create a universal @@ -1057,6 +1057,7 @@ PyUFunc_FromFuncAndData Specification What follows is the full specification of PyUFunc_FromFuncAndData, which automatically generates a ufunc from a C function with the correct signature. +.. seealso:: :c:func:`PyUFunc_FromFuncAndDataAndSignature` .. c:function:: PyObject *PyUFunc_FromFuncAndData( \ PyUFuncGenericFunction* func, void** data, char* types, int ntypes, \ diff --git a/doc/source/user/numpy-for-matlab-users.rst b/doc/source/user/numpy-for-matlab-users.rst index ae379624e..475c68c04 100644 --- a/doc/source/user/numpy-for-matlab-users.rst +++ b/doc/source/user/numpy-for-matlab-users.rst @@ -32,9 +32,9 @@ Some Key Differences in linear algebra. - In NumPy the basic type is a multidimensional ``array``. Operations on these arrays in all dimensionalities including 2D are element-wise - operations. However, there is a special ``matrix`` type for doing - linear algebra, which is just a subclass of the ``array`` class. - Operations on matrix-class arrays are linear algebra operations. + operations. One needs to use specific functions for linear algebra + (though for matrix multiplication, one can use the ``@`` operator + in python 3.5 and above). * - MATLAB® uses 1 (one) based indexing. The initial element of a sequence is found using a(1). @@ -50,8 +50,8 @@ Some Key Differences an excellent general-purpose programming language. While Matlab's syntax for some array manipulations is more compact than NumPy's, NumPy (by virtue of being an add-on to Python) can do many - things that Matlab just cannot, for instance subclassing the main - array type to do both array and matrix math cleanly. + things that Matlab just cannot, for instance dealing properly with + stacks of matrices. * - In MATLAB®, arrays have pass-by-value semantics, with a lazy copy-on-write scheme to prevent actually creating copies until they @@ -63,8 +63,10 @@ Some Key Differences 'array' or 'matrix'? 
Which should I use? ======================================== -NumPy provides, in addition to ``np.ndarray``, an additional matrix type -that you may see used in some existing code. Which one to use? +Historically, NumPy has provided a special matrix type, `np.matrix`, which +is a subclass of ndarray which makes binary operations linear algebra +operations. You may see it used in some existing code instead of `np.array`. +So, which one to use? Short answer ------------ @@ -82,6 +84,8 @@ had to use ``dot`` instead of ``*`` to multiply (reduce) two tensors (scalar product, matrix vector multiplication etc.). Since Python 3.5 you can use the matrix multiplication ``@`` operator. +Given the above, we intend to deprecate ``matrix`` eventually. + Long answer ----------- @@ -91,12 +95,14 @@ for many kinds of numerical computing, while ``matrix`` is intended to facilitate linear algebra computations specifically. In practice there are only a handful of key differences between the two. -- Operator ``*``, ``dot()``, and ``multiply()``: +- Operators ``*`` and ``@``, functions ``dot()``, and ``multiply()``: - - For ``array``, **'``*``\ ' means element-wise multiplication**, - and the ``dot()`` function is used for matrix multiplication. - - For ``matrix``, **'``*``\ ' means matrix multiplication**, and the - ``multiply()`` function is used for element-wise multiplication. + - For ``array``, **``*`` means element-wise multiplication**, while + **``@`` means matrix multiplication**; they have associated functions + ``multiply()`` and ``dot()``. (Before python 3.5, ``@`` did not exist + and one had to use ``dot()`` for matrix multiplication). + - For ``matrix``, **``*`` means matrix multiplication**, and for + element-wise multiplication one has to use the ``multiply()`` function. - Handling of vectors (one-dimensional arrays) @@ -132,15 +138,13 @@ There are pros and cons to using both: - ``array`` + - ``:)`` Element-wise multiplication is easy: ``A*B``. 
+ - ``:(`` You have to remember that matrix multiplication has its own + operator, ``@``. - ``:)`` You can treat one-dimensional arrays as *either* row or column - vectors. ``dot(A,v)`` treats ``v`` as a column vector, while - ``dot(v,A)`` treats ``v`` as a row vector. This can save you having to + vectors. ``A @ v`` treats ``v`` as a column vector, while + ``v @ A`` treats ``v`` as a row vector. This can save you having to type a lot of transposes. - - ``<:(`` Having to use the ``dot()`` function for matrix-multiply is - messy -- ``dot(dot(A,B),C)`` vs. ``A*B*C``. This isn't an issue with - Python >= 3.5 because the ``@`` operator allows it to be written as - ``A @ B @ C``. - - ``:)`` Element-wise multiplication is easy: ``A*B``. - ``:)`` ``array`` is the "default" NumPy type, so it gets the most testing, and is the type most likely to be returned by 3rd party code that uses NumPy. @@ -149,6 +153,8 @@ There are pros and cons to using both: with that. - ``:)`` *All* operations (``*``, ``/``, ``+``, ``-`` etc.) are element-wise. + - ``:(`` Sparse matrices from ``scipy.sparse`` do not interact as well + with arrays. - ``matrix`` @@ -162,35 +168,17 @@ There are pros and cons to using both: argument. This shouldn't happen with NumPy functions (if it does it's a bug), but 3rd party code based on NumPy may not honor type preservation like NumPy does. - - ``:)`` ``A*B`` is matrix multiplication, so more convenient for - linear algebra (For Python >= 3.5 plain arrays have the same convenience - with the ``@`` operator). + - ``:)`` ``A*B`` is matrix multiplication, so it looks just like you write + it in linear algebra (For Python >= 3.5 plain arrays have the same + convenience with the ``@`` operator). - ``<:(`` Element-wise multiplication requires calling a function, ``multiply(A,B)``. - ``<:(`` The use of operator overloading is a bit illogical: ``*`` does not work element-wise but ``/`` does. + - Interaction with ``scipy.sparse`` is a bit cleaner. 
-The ``array`` is thus much more advisable to use. - -Facilities for Matrix Users -=========================== - -NumPy has some features that facilitate the use of the ``matrix`` type, -which hopefully make things easier for Matlab converts. - -- A ``matlib`` module has been added that contains matrix versions of - common array constructors like ``ones()``, ``zeros()``, ``empty()``, - ``eye()``, ``rand()``, ``repmat()``, etc. Normally these functions - return ``array``\ s, but the ``matlib`` versions return ``matrix`` - objects. -- ``mat`` has been changed to be a synonym for ``asmatrix``, rather - than ``matrix``, thus making it a concise way to convert an ``array`` - to a ``matrix`` without copying the data. -- Some top-level functions have been removed. For example - ``numpy.rand()`` now needs to be accessed as ``numpy.random.rand()``. - Or use the ``rand()`` from the ``matlib`` module. But the - "numpythonic" way is to use ``numpy.random.random()``, which takes a - tuple for the shape, like other numpy functions. +The ``array`` is thus much more advisable to use. Indeed, we intend to +deprecate ``matrix`` eventually. Table of Rough MATLAB-NumPy Equivalents ======================================= @@ -200,23 +188,6 @@ expressions. **These are not exact equivalents**, but rather should be taken as hints to get you going in the right direction. For more detail read the built-in documentation on the NumPy functions. -Some care is necessary when writing functions that take arrays or -matrices as arguments --- if you are expecting an ``array`` and are -given a ``matrix``, or vice versa, then '\*' (multiplication) will give -you unexpected results. You can convert back and forth between arrays -and matrices using - -- ``asarray``: always returns an object of type ``array`` -- ``asmatrix`` or ``mat``: always return an object of type - ``matrix`` -- ``asanyarray``: always returns an ``array`` object or a subclass - derived from it, depending on the input. 
For instance if you pass in - a ``matrix`` it returns a ``matrix``. - -These functions all accept both arrays and matrices (among other things -like Python lists), and thus are useful when writing functions that -should accept any array-like object. - In the table below, it is assumed that you have executed the following commands in Python: @@ -309,8 +280,7 @@ Linear Algebra Equivalents - 2x3 matrix literal * - ``[ a b; c d ]`` - - ``vstack([hstack([a,b]), hstack([c,d])])`` or - ``block([[a, b], [c, d])`` + - ``block([[a,b], [c,d]])`` - construct a matrix from blocks ``a``, ``b``, ``c``, and ``d`` * - ``a(end)`` @@ -369,7 +339,7 @@ Linear Algebra Equivalents - conjugate transpose of ``a`` * - ``a * b`` - - ``a.dot(b)`` or ``a@b`` (Python 3.5 or newer) + - ``a @ b`` - matrix multiply * - ``a .* b`` @@ -520,7 +490,7 @@ Linear Algebra Equivalents from each pair * - ``norm(v)`` - - ``sqrt(dot(v,v))`` or ``np.linalg.norm(v)`` + - ``sqrt(v @ v)`` or ``np.linalg.norm(v)`` - L2 norm of vector ``v`` * - ``a & b`` diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst index de4079080..57a7004cc 100644 --- a/doc/source/user/quickstart.rst +++ b/doc/source/user/quickstart.rst @@ -297,19 +297,19 @@ created and filled with the result. Unlike in many matrix languages, the product operator ``*`` operates elementwise in NumPy arrays. The matrix product can be performed using -the ``dot`` function or method:: +the ``@`` operator (in python >=3.5) or the ``dot`` function or method:: >>> A = np.array( [[1,1], ... [0,1]] ) >>> B = np.array( [[2,0], ... [3,4]] ) - >>> A*B # elementwise product + >>> A * B # elementwise product array([[2, 0], [0, 4]]) - >>> A.dot(B) # matrix product + >>> A @ B # matrix product array([[5, 4], [3, 4]]) - >>> np.dot(A, B) # another matrix product + >>> A.dot(B) # another matrix product array([[5, 4], [3, 4]]) @@ -1357,7 +1357,7 @@ See linalg.py in numpy folder for more. 
[ 0., 1.]]) >>> j = np.array([[0.0, -1.0], [1.0, 0.0]]) - >>> np.dot (j, j) # matrix product + >>> j @ j # matrix product array([[-1., 0.], [ 0., -1.]]) diff --git a/numpy/__init__.py b/numpy/__init__.py index d10a1ecd3..d250ed5ac 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -194,3 +194,28 @@ else: from numpy.testing._private.pytesttester import PytestTester test = PytestTester(__name__) del PytestTester + + + def _sanity_check(): + """ + Quick sanity checks for common bugs caused by environment. + There are some cases e.g. with wrong BLAS ABI that cause wrong + results under specific runtime conditions that are not necessarily + achieved during test suite runs, and it is useful to catch those early. + + See https://github.com/numpy/numpy/issues/8577 and other + similar bug reports. + + """ + try: + x = ones(2, dtype=float32) + if not abs(x.dot(x) - 2.0) < 1e-5: + raise AssertionError() + except AssertionError: + msg = ("The current Numpy installation ({!r}) fails to " + "pass simple sanity checks. This can be caused for example " + "by incorrect BLAS library being linked in.") + raise RuntimeError(msg.format(__file__)) + + _sanity_check() + del _sanity_check diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py index 93a521658..9372b3431 100644 --- a/numpy/add_newdocs.py +++ b/numpy/add_newdocs.py @@ -385,10 +385,11 @@ add_newdoc('numpy.core', 'nditer', array([ 0.5, 1.5, 4.5, 9.5, 16.5]) If operand flags `"writeonly"` or `"readwrite"` are used the operands may - be views into the original data with the WRITEBACKIFCOPY flag. In this case - nditer must be used as a context manager. The temporary - data will be written back to the original data when the `` __exit__`` - function is called but not before:: + be views into the original data with the `WRITEBACKIFCOPY` flag. In this case + nditer must be used as a context manager or the nditer.close + method must be called before using the result. 
The temporary + data will be written back to the original data when the `__exit__` + function is called but not before: >>> a = np.arange(6, dtype='i4')[::-2] >>> with nditer(a, [], @@ -405,7 +406,7 @@ add_newdoc('numpy.core', 'nditer', references (like `x` in the example) may or may not share data with the original data `a`. If writeback semantics were active, i.e. if `x.base.flags.writebackifcopy` is `True`, then exiting the iterator - will sever the connection between `x` and `a`, writing to `x` will + will sever the connection between `x` and `a`, writing to `x` will no longer write to `a`. If writeback semantics are not active, then `x.data` will still point at some part of `a.data`, and writing to one will affect the other. @@ -566,6 +567,11 @@ add_newdoc('numpy.core', 'nditer', ('close', Resolve all writeback semantics in writeable operands. + See Also + -------- + + :ref:`nditer-context-manager` + """)) @@ -4753,6 +4759,11 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tofile', machines with different endianness. Some of these problems can be overcome by outputting the data as text files, at the expense of speed and file size. + + When fid is a file object, array contents are directly written to the + file, bypassing the file object's ``write`` method. As a result, tofile + cannot be used with files objects supporting compression (e.g., GzipFile) + or file-like objects that do not support ``fileno()`` (e.g., BytesIO). """)) @@ -5596,6 +5607,37 @@ add_newdoc('numpy.core.multiarray', 'unpackbits', """) +add_newdoc('numpy.core._multiarray_tests', 'format_float_OSprintf_g', + """ + format_float_OSprintf_g(val, precision) + + Print a floating point scalar using the system's printf function, + equivalent to: + + printf("%.*g", precision, val); + + for half/float/double, or replacing 'g' by 'Lg' for longdouble. This + method is designed to help cross-validate the format_float_* methods. 
+ + Parameters + ---------- + val : python float or numpy floating scalar + Value to format. + + precision : non-negative integer, optional + Precision given to printf. + + Returns + ------- + rep : string + The string representation of the floating point value + + See Also + -------- + format_float_scientific + format_float_positional + """) + ############################################################################## # @@ -5641,10 +5683,13 @@ add_newdoc('numpy.core', 'ufunc', Alternate array object(s) in which to put the result; if provided, it must have a shape that the inputs broadcast to. A tuple of arrays (possible only as a keyword argument) must have length equal to the - number of outputs; use `None` for outputs to be allocated by the ufunc. + number of outputs; use `None` for uninitialized outputs to be + allocated by the ufunc. where : array_like, optional Values of True indicate to calculate the ufunc at that position, values - of False indicate to leave the value in the output alone. + of False indicate to leave the value in the output alone. Note that if + an uninitialized return array is created via the default ``out=None``, + then the elements where the values are False will remain uninitialized. **kwargs For other keyword-only arguments, see the :ref:`ufunc docs <ufuncs.kwargs>`. @@ -5652,7 +5697,8 @@ add_newdoc('numpy.core', 'ufunc', ------- r : ndarray or tuple of ndarray `r` will have the shape that the arrays in `x` broadcast to; if `out` is - provided, `r` will be equal to `out`. If the function has more than one + provided, it will be returned. If not, `r` will be allocated and + may contain uninitialized values. If the function has more than one output, then the result will be a tuple of arrays. 
""") @@ -5850,7 +5896,7 @@ add_newdoc('numpy.core', 'ufunc', ('signature', add_newdoc('numpy.core', 'ufunc', ('reduce', """ - reduce(a, axis=0, dtype=None, out=None, keepdims=False) + reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial) Reduces `a`'s dimension by one, by applying ufunc along one axis. @@ -5906,6 +5952,14 @@ add_newdoc('numpy.core', 'ufunc', ('reduce', the result will broadcast correctly against the original `arr`. .. versionadded:: 1.7.0 + initial : scalar, optional + The value with which to start the reduction. + If the ufunc has no identity or the dtype is object, this defaults + to None - otherwise it defaults to ufunc.identity. + If ``None`` is given, the first element of the reduction is used, + and an error is thrown if the reduction is empty. + + .. versionadded:: 1.15.0 Returns ------- @@ -5937,7 +5991,24 @@ add_newdoc('numpy.core', 'ufunc', ('reduce', >>> np.add.reduce(X, 2) array([[ 1, 5], [ 9, 13]]) - + + You can use the ``initial`` keyword argument to initialize the reduction with a + different value. + + >>> np.add.reduce([10], initial=5) + 15 + >>> np.add.reduce(np.ones((2, 2, 2)), axis=(0, 2), initializer=10) + array([14., 14.]) + + Allows reductions of empty arrays where they would normally fail, i.e. + for ufuncs without an identity. + + >>> np.minimum.reduce([], initial=np.inf) + inf + >>> np.minimum.reduce([]) + Traceback (most recent call last): + ... + ValueError: zero-size array to reduction operation minimum which has no identity """)) add_newdoc('numpy.core', 'ufunc', ('accumulate', diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py index 0f928676b..33f6d01a8 100644 --- a/numpy/core/_methods.py +++ b/numpy/core/_methods.py @@ -11,6 +11,7 @@ from numpy.core import multiarray as mu from numpy.core import umath as um from numpy.core.numeric import asanyarray from numpy.core import numerictypes as nt +from numpy._globals import _NoValue # save those O(100) nanoseconds! 
umr_maximum = um.maximum.reduce @@ -22,17 +23,21 @@ umr_all = um.logical_and.reduce # avoid keyword arguments to speed up parsing, saves about 15%-20% for very # small reductions -def _amax(a, axis=None, out=None, keepdims=False): - return umr_maximum(a, axis, None, out, keepdims) +def _amax(a, axis=None, out=None, keepdims=False, + initial=_NoValue): + return umr_maximum(a, axis, None, out, keepdims, initial) -def _amin(a, axis=None, out=None, keepdims=False): - return umr_minimum(a, axis, None, out, keepdims) +def _amin(a, axis=None, out=None, keepdims=False, + initial=_NoValue): + return umr_minimum(a, axis, None, out, keepdims, initial) -def _sum(a, axis=None, dtype=None, out=None, keepdims=False): - return umr_sum(a, axis, dtype, out, keepdims) +def _sum(a, axis=None, dtype=None, out=None, keepdims=False, + initial=_NoValue): + return umr_sum(a, axis, dtype, out, keepdims, initial) -def _prod(a, axis=None, dtype=None, out=None, keepdims=False): - return umr_prod(a, axis, dtype, out, keepdims) +def _prod(a, axis=None, dtype=None, out=None, keepdims=False, + initial=_NoValue): + return umr_prod(a, axis, dtype, out, keepdims, initial) def _any(a, axis=None, dtype=None, out=None, keepdims=False): return umr_any(a, axis, dtype, out, keepdims) diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index f39248bd0..6d15cb23f 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -1088,7 +1088,7 @@ def format_float_positional(x, precision=None, unique=True, Examples -------- - >>> np.format_float_scientific(np.float32(np.pi)) + >>> np.format_float_positional(np.float32(np.pi)) '3.1415927' >>> np.format_float_positional(np.float16(np.pi)) '3.14' diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 7492baf9d..632bcb41f 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -908,8 +908,8 @@ if sys.version_info[0] >= 3: del 
defdict['divide'] def indent(st, spaces): - indention = ' '*spaces - indented = indention + st.replace('\n', '\n'+indention) + indentation = ' '*spaces + indented = indentation + st.replace('\n', '\n'+indentation) # trim off any trailing spaces indented = re.sub(r' +$', r'', indented) return indented @@ -972,7 +972,7 @@ def make_arrays(funcdict): for vt in t.simd: code2list.append(textwrap.dedent("""\ #ifdef HAVE_ATTRIBUTE_TARGET_{ISA} - if (npy_cpu_supports("{ISA}")) {{ + if (npy_cpu_supports("{isa}")) {{ {fname}_functions[{idx}] = {type}_{fname}_{isa}; }} #endif diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index 157fa3447..6cfbbbcc7 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -6,7 +6,7 @@ Each dictionary contains name -> index pair. Whenever you change one index, you break the ABI (and the ABI version number should be incremented). Whenever you add an item to one of the dict, the API needs to be updated in both setup_common.py and by adding an appropriate -entry to cversion.txt (generate the hash via "python cversions.py". +entry to cversion.txt (generate the hash via "python cversions.py"). When adding a function, make sure to use the next integer not used as an index (in case you use an existing index or jump, the build will stop and raise an diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py index bd90d0460..f7d58a26f 100644 --- a/numpy/core/code_generators/ufunc_docstrings.py +++ b/numpy/core/code_generators/ufunc_docstrings.py @@ -3365,7 +3365,7 @@ add_newdoc('numpy.core.umath', 'sinh', add_newdoc('numpy.core.umath', 'sqrt', """ - Return the positive square-root of an array, element-wise. + Return the non-negative square-root of an array, element-wise. 
Parameters ---------- diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py index 8cd6eae12..a4c18d482 100644 --- a/numpy/core/einsumfunc.py +++ b/numpy/core/einsumfunc.py @@ -1109,7 +1109,7 @@ def einsum(*operands, **kwargs): # Checks have already been handled input_str, results_index = einsum_str.split('->') input_left, input_right = input_str.split(',') - if 1 in tmp_operands[0] or 1 in tmp_operands[1]: + if 1 in tmp_operands[0].shape or 1 in tmp_operands[1].shape: left_dims = {dim: size for dim, size in zip(input_left, tmp_operands[0].shape)} right_dims = {dim: size for dim, size in @@ -1148,7 +1148,7 @@ def einsum(*operands, **kwargs): # Do the contraction new_view = c_einsum(einsum_str, *tmp_operands, **einsum_kwargs) - # Append new items and derefernce what we can + # Append new items and dereference what we can operands.append(new_view) del tmp_operands, new_view diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index 948c2139d..d1aae0aa0 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -140,6 +140,7 @@ def take(a, indices, axis=None, out=None, mode='raise'): -------- compress : Take elements using a boolean mask ndarray.take : equivalent method + take_along_axis : Take elements by matching the array and the index arrays Notes ----- @@ -478,6 +479,7 @@ def put(a, ind, v, mode='raise'): See Also -------- putmask, place + put_along_axis : Put elements by matching the array and the index arrays Examples -------- @@ -723,7 +725,9 @@ def argpartition(a, kth, axis=-1, kind='introselect', order=None): ------- index_array : ndarray, int Array of indices that partition `a` along the specified axis. - In other words, ``a[index_array]`` yields a partitioned `a`. + If `a` is one-dimensional, ``a[index_array]`` yields a partitioned `a`. + More generally, ``np.take_along_axis(a, index_array, axis=a)`` always + yields the partitioned `a`, irrespective of dimensionality. 
See Also -------- @@ -904,6 +908,8 @@ def argsort(a, axis=-1, kind='quicksort', order=None): index_array : ndarray, int Array of indices that sort `a` along the specified axis. If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`. + More generally, ``np.take_along_axis(a, index_array, axis=a)`` always + yields the sorted `a`, irrespective of dimensionality. See Also -------- @@ -1336,10 +1342,11 @@ def diagonal(a, offset=0, axis1=0, axis2=1): Returns ------- array_of_diagonals : ndarray - If `a` is 2-D and not a `matrix`, a 1-D array of the same type as `a` - containing the diagonal is returned. If `a` is a `matrix`, a 1-D - array containing the diagonal is returned in order to maintain - backward compatibility. + If `a` is 2-D, then a 1-D array containing the diagonal and of the + same type as `a` is returned unless `a` is a `matrix`, in which case + a 1-D array rather than a (2-D) `matrix` is returned in order to + maintain backward compatibility. + If ``a.ndim > 2``, then the dimensions specified by `axis1` and `axis2` are removed, and a new axis inserted at the end corresponding to the diagonal. @@ -1496,10 +1503,9 @@ def ravel(a, order='C'): Returns ------- y : array_like - If `a` is a matrix, y is a 1-D ndarray, otherwise y is an array of - the same subtype as `a`. The shape of the returned array is - ``(a.size,)``. Matrices are special cased for backward - compatibility. + y is an array of the same subtype as `a`, with shape ``(a.size,)``. + Note that matrices are special cased for backward compatibility, if `a` + is a matrix, then y is a 1-D ndarray. See Also -------- @@ -1812,7 +1818,7 @@ def clip(a, a_min, a_max, out=None): return _wrapfunc(a, 'clip', a_min, a_max, out=out) -def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): +def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue): """ Sum of array elements over a given axis. 
@@ -1851,6 +1857,10 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): `ndarray`, however any non-default value will be. If the sub-class' method does not implement `keepdims` any exceptions will be raised. + initial : scalar, optional + Starting value for the sum. See `~numpy.ufunc.reduce` for details. + + .. versionadded:: 1.15.0 Returns ------- @@ -1898,6 +1908,10 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8) -128 + You can also start the sum with a value other than zero: + + >>> np.sum([10], initial=5) + 15 """ if isinstance(a, _gentype): # 2018-02-25, 1.15.0 @@ -1912,7 +1926,8 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): return out return res - return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims) + return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims, + initial=initial) def any(a, axis=None, out=None, keepdims=np._NoValue): @@ -2209,7 +2224,7 @@ def ptp(a, axis=None, out=None, keepdims=np._NoValue): return _methods._ptp(a, axis=axis, out=out, **kwargs) -def amax(a, axis=None, out=None, keepdims=np._NoValue): +def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue): """ Return the maximum of an array or maximum along an axis. @@ -2241,6 +2256,13 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue): sub-class' method does not implement `keepdims` any exceptions will be raised. + initial : scalar, optional + The minimum value of an output element. Must be present to allow + computation on empty slice. See `~numpy.ufunc.reduce` for details. + + .. 
versionadded:: 1.15.0 + + Returns ------- amax : ndarray or scalar @@ -2293,11 +2315,26 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue): >>> np.nanmax(b) 4.0 + You can use an initial value to compute the maximum of an empty slice, or + to initialize it to a different value: + + >>> np.max([[-50], [10]], axis=-1, initial=0) + array([ 0, 10]) + + Notice that the initial value is used as one of the elements for which the + maximum is determined, unlike for the default argument Python's max + function, which is only used for empty iterables. + + >>> np.max([5], initial=6) + 6 + >>> max([5], default=6) + 5 """ - return _wrapreduction(a, np.maximum, 'max', axis, None, out, keepdims=keepdims) + return _wrapreduction(a, np.maximum, 'max', axis, None, out, keepdims=keepdims, + initial=initial) -def amin(a, axis=None, out=None, keepdims=np._NoValue): +def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue): """ Return the minimum of an array or minimum along an axis. @@ -2329,6 +2366,12 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue): sub-class' method does not implement `keepdims` any exceptions will be raised. + initial : scalar, optional + The maximum value of an output element. Must be present to allow + computation on empty slice. See `~numpy.ufunc.reduce` for details. + + .. versionadded:: 1.15.0 + Returns ------- amin : ndarray or scalar @@ -2381,8 +2424,22 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue): >>> np.nanmin(b) 0.0 + >>> np.min([[-50], [10]], axis=-1, initial=0) + array([-50, 0]) + + Notice that the initial value is used as one of the elements for which the + minimum is determined, unlike for the default argument Python's max + function, which is only used for empty iterables. + + Notice that this isn't the same as Python's ``default`` argument. 
+ + >>> np.min([6], initial=5) + 5 + >>> min([6], default=5) + 6 """ - return _wrapreduction(a, np.minimum, 'min', axis, None, out, keepdims=keepdims) + return _wrapreduction(a, np.minimum, 'min', axis, None, out, keepdims=keepdims, + initial=initial) def alen(a): @@ -2418,7 +2475,7 @@ def alen(a): return len(array(a, ndmin=1)) -def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): +def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue): """ Return the product of array elements over a given axis. @@ -2458,6 +2515,10 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): `ndarray`, however any non-default value will be. If the sub-class' method does not implement `keepdims` any exceptions will be raised. + initial : scalar, optional + The starting value for this product. See `~numpy.ufunc.reduce` for details. + + .. versionadded:: 1.15.0 Returns ------- @@ -2515,8 +2576,13 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): >>> np.prod(x).dtype == int True + You can also start the product with a value other than one: + + >>> np.prod([1, 2], initial=5) + 10 """ - return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out, keepdims=keepdims) + return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out, keepdims=keepdims, + initial=initial) def cumprod(a, axis=None, dtype=None, out=None): diff --git a/numpy/core/include/numpy/npy_interrupt.h b/numpy/core/include/numpy/npy_interrupt.h index f71fd689e..40cb7ac5e 100644 --- a/numpy/core/include/numpy/npy_interrupt.h +++ b/numpy/core/include/numpy/npy_interrupt.h @@ -55,7 +55,7 @@ Ideas: Simple Interface: -In your C-extension: around a block of code you want to be interruptable +In your C-extension: around a block of code you want to be interruptible with a SIGINT NPY_SIGINT_ON diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h index ba32bcdd3..582390cdc 100644 --- 
a/numpy/core/include/numpy/npy_math.h +++ b/numpy/core/include/numpy/npy_math.h @@ -524,8 +524,17 @@ npy_clongdouble npy_catanhl(npy_clongdouble z); #define NPY_FPE_UNDERFLOW 4 #define NPY_FPE_INVALID 8 -int npy_get_floatstatus(void); +int npy_clear_floatstatus_barrier(char*); +int npy_get_floatstatus_barrier(char*); +/* + * use caution with these - clang and gcc8.1 are known to reorder calls + * to this form of the function which can defeat the check. The _barrier + * form of the call is preferable, where the argument is + * (char*)&local_variable + */ int npy_clear_floatstatus(void); +int npy_get_floatstatus(void); + void npy_set_floatstatus_divbyzero(void); void npy_set_floatstatus_overflow(void); void npy_set_floatstatus_underflow(void); diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index d0ac1fd7d..4b1b3d325 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -167,7 +167,7 @@ typedef struct _tagPyUFuncObject { int *core_dim_ixs; /* * positions of 1st core dimensions of each - * argument in core_dim_ixs + * argument in core_dim_ixs, equivalent to cumsum(core_num_dims) */ int *core_offsets; /* signature string for printing purpose */ diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py index 5be45affd..b2ff0e793 100644 --- a/numpy/core/memmap.py +++ b/numpy/core/memmap.py @@ -236,11 +236,10 @@ class memmap(ndarray): raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes - shape = (size,) else: if not isinstance(shape, tuple): shape = (shape,) - size = 1 + size = np.intp(1) # avoid default choice of np.int_, which might overflow for k in shape: size *= k diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index 1108d4667..7ade3d224 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -489,9 +489,9 @@ def asarray(a, dtype=None, order=None): Contrary to `asanyarray`, ndarray subclasses are not 
passed through: - >>> issubclass(np.matrix, np.ndarray) + >>> issubclass(np.recarray, np.ndarray) True - >>> a = np.matrix([[1, 2]]) + >>> a = np.array([(1.0, 2), (3.0, 4)], dtype='f4,i4').view(np.recarray) >>> np.asarray(a) is a False >>> np.asanyarray(a) is a @@ -545,7 +545,7 @@ def asanyarray(a, dtype=None, order=None): Instances of `ndarray` subclasses are passed through as-is: - >>> a = np.matrix([1, 2]) + >>> a = np.array([(1.0, 2), (3.0, 4)], dtype='f4,i4').view(np.recarray) >>> np.asanyarray(a) is a True @@ -2035,7 +2035,7 @@ def binary_repr(num, width=None): '11101' """ - def warn_if_insufficient(width, binwdith): + def warn_if_insufficient(width, binwidth): if width is not None and width < binwidth: warnings.warn( "Insufficient bit width provided. This behavior " @@ -2280,7 +2280,7 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False): relative difference (`rtol` * abs(`b`)) and the absolute difference `atol` are added together to compare against the absolute difference between `a` and `b`. - + .. warning:: The default `atol` is not appropriate for comparing numbers that are much smaller than one (see Notes). 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 15f6e1522..f826b278f 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -452,17 +452,8 @@ def configuration(parent_package='',top_path=None): moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) # Get long double representation - if sys.platform != 'darwin': - rep = check_long_double_representation(config_cmd) - if rep in ['INTEL_EXTENDED_12_BYTES_LE', - 'INTEL_EXTENDED_16_BYTES_LE', - 'MOTOROLA_EXTENDED_12_BYTES_BE', - 'IEEE_QUAD_LE', 'IEEE_QUAD_BE', - 'IEEE_DOUBLE_LE', 'IEEE_DOUBLE_BE', - 'DOUBLE_DOUBLE_BE', 'DOUBLE_DOUBLE_LE']: - moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) - else: - raise ValueError("Unrecognized long double format: %s" % rep) + rep = check_long_double_representation(config_cmd) + moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) # Py3K check if sys.version_info[0] == 3: @@ -664,7 +655,7 @@ def configuration(parent_package='',top_path=None): def get_mathlib_info(*args): # Another ugly hack: the mathlib info is known once build_src is run, # but we cannot use add_installed_pkg_config here either, so we only - # update the substition dictionary during npymath build + # update the substitution dictionary during npymath build config_cmd = config.get_config_cmd() # Check that the toolchain works, to fail early if it doesn't diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index a8aba40bd..70a43046c 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -335,9 +335,9 @@ _MOTOROLA_EXTENDED_12B = ['300', '031', '000', '000', '353', '171', _IEEE_QUAD_PREC_BE = ['300', '031', '326', '363', '105', '100', '000', '000', '000', '000', '000', '000', '000', '000', '000', '000'] _IEEE_QUAD_PREC_LE = _IEEE_QUAD_PREC_BE[::-1] -_DOUBLE_DOUBLE_BE = (['301', '235', '157', '064', '124', '000', '000', '000'] + +_IBM_DOUBLE_DOUBLE_BE = (['301', '235', '157', '064', '124', '000', '000', '000'] + ['000'] * 8) -_DOUBLE_DOUBLE_LE = (['000', '000', '000', '124', '064', 
'157', '235', '301'] + +_IBM_DOUBLE_DOUBLE_LE = (['000', '000', '000', '124', '064', '157', '235', '301'] + ['000'] * 8) def long_double_representation(lines): @@ -364,11 +364,16 @@ def long_double_representation(lines): # the long double if read[-8:] == _AFTER_SEQ: saw = copy.copy(read) + # if the content was 12 bytes, we only have 32 - 8 - 12 = 12 + # "before" bytes. In other words the first 4 "before" bytes went + # past the sliding window. if read[:12] == _BEFORE_SEQ[4:]: if read[12:-8] == _INTEL_EXTENDED_12B: return 'INTEL_EXTENDED_12_BYTES_LE' if read[12:-8] == _MOTOROLA_EXTENDED_12B: return 'MOTOROLA_EXTENDED_12_BYTES_BE' + # if the content was 16 bytes, we are left with 32-8-16 = 16 + # "before" bytes, so 8 went past the sliding window. elif read[:8] == _BEFORE_SEQ[8:]: if read[8:-8] == _INTEL_EXTENDED_16B: return 'INTEL_EXTENDED_16_BYTES_LE' @@ -376,10 +381,11 @@ def long_double_representation(lines): return 'IEEE_QUAD_BE' elif read[8:-8] == _IEEE_QUAD_PREC_LE: return 'IEEE_QUAD_LE' - elif read[8:-8] == _DOUBLE_DOUBLE_BE: - return 'DOUBLE_DOUBLE_BE' - elif read[8:-8] == _DOUBLE_DOUBLE_LE: - return 'DOUBLE_DOUBLE_LE' + elif read[8:-8] == _IBM_DOUBLE_DOUBLE_LE: + return 'IBM_DOUBLE_DOUBLE_LE' + elif read[8:-8] == _IBM_DOUBLE_DOUBLE_BE: + return 'IBM_DOUBLE_DOUBLE_BE' + # if the content was 8 bytes, left with 32-8-8 = 16 bytes elif read[:16] == _BEFORE_SEQ: if read[16:-8] == _IEEE_DOUBLE_LE: return 'IEEE_DOUBLE_LE' diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src index 0299f1a1b..cba96a4c2 100644 --- a/numpy/core/src/multiarray/_multiarray_tests.c.src +++ b/numpy/core/src/multiarray/_multiarray_tests.c.src @@ -3,7 +3,9 @@ #include <Python.h> #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */ #include "numpy/arrayobject.h" +#include "numpy/arrayscalars.h" #include "numpy/npy_math.h" +#include "numpy/halffloat.h" #include "mem_overlap.h" #include "npy_extint128.h" #include "common.h" @@ -1828,6 
+1830,63 @@ call_npy_@name@@suffix@(PyObject *NPY_UNUSED(self), PyObject *args) /**end repeat**/ +/* + * For development/testing purposes, it's convenient to have access to the + * system printf for floats. This is a very simple printf interface. + */ +PyObject * +PrintFloat_Printf_g(PyObject *obj, int precision) +{ + char str[1024]; + + if (PyArray_IsScalar(obj, Half)) { + npy_half x = ((PyHalfScalarObject *)obj)->obval; + PyOS_snprintf(str, sizeof(str), "%.*g", precision, + npy_half_to_double(x)); + } + else if (PyArray_IsScalar(obj, Float)) { + npy_float x = ((PyFloatScalarObject *)obj)->obval; + PyOS_snprintf(str, sizeof(str), "%.*g", precision, x); + } + else if (PyArray_IsScalar(obj, Double)) { + npy_double x = ((PyDoubleScalarObject *)obj)->obval; + PyOS_snprintf(str, sizeof(str), "%.*g", precision, x); + /* would be better to use lg, but not available in C90 */ + } + else if (PyArray_IsScalar(obj, LongDouble)) { + npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval; + PyOS_snprintf(str, sizeof(str), "%.*Lg", precision, x); + } + else{ + double val = PyFloat_AsDouble(obj); + if (error_converting(val)) { + return NULL; + } + PyOS_snprintf(str, sizeof(str), "%.*g", precision, val); + } + + return PyUString_FromString(str); +} + + +static PyObject * +printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) +{ + PyObject *obj; + int precision; + + if (!PyArg_ParseTuple(args,"Oi:format_float_OSprintf_g", &obj, + &precision)) { + return NULL; + } + + if (precision < 0) { + PyErr_SetString(PyExc_TypeError, "precision must be non-negative"); + return NULL; + } + + return PrintFloat_Printf_g(obj, precision); +} static PyMethodDef Multiarray_TestsMethods[] = { {"IsPythonScalar", @@ -1967,7 +2026,9 @@ static PyMethodDef Multiarray_TestsMethods[] = { /**end repeat1**/ /**end repeat**/ - + {"format_float_OSprintf_g", + (PyCFunction)printf_float_g, + METH_VARARGS , NULL}, {NULL, NULL, 0, NULL} /* Sentinel */ }; diff --git 
a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c index 3d259ae05..17de99cb9 100644 --- a/numpy/core/src/multiarray/array_assign_scalar.c +++ b/numpy/core/src/multiarray/array_assign_scalar.c @@ -245,6 +245,10 @@ PyArray_AssignRawScalar(PyArrayObject *dst, allocated_src_data = 1; } + if (PyDataType_FLAGCHK(PyArray_DESCR(dst), NPY_NEEDS_INIT)) { + memset(tmp_src_data, 0, PyArray_DESCR(dst)->elsize); + } + if (PyArray_CastRawArrays(1, src_data, tmp_src_data, 0, 0, src_dtype, PyArray_DESCR(dst), 0) != NPY_SUCCEED) { src_data = tmp_src_data; diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 69538c6b7..e1db4d6f6 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -86,7 +86,8 @@ NPY_NO_EXPORT int PyArray_SetUpdateIfCopyBase(PyArrayObject *arr, PyArrayObject *base) { int ret; - /* 2017-Nov-10 1.14 */ + /* 2017-Nov -10 1.14 (for PyPy only) */ + /* 2018-April-21 1.15 (all Python implementations) */ if (DEPRECATE("PyArray_SetUpdateIfCopyBase is deprecated, use " "PyArray_SetWritebackIfCopyBase instead, and be sure to call " "PyArray_ResolveWritebackIfCopy before the array is deallocated, " @@ -1662,7 +1663,6 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) descr = NULL; goto fail; } - PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL); Py_INCREF(buffer.base); if (PyArray_SetBaseObject(ret, buffer.base) < 0) { Py_DECREF(ret); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 5e6804a5c..48003e6a3 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -182,6 +182,15 @@ npy_strtoull(const char *str, char **endptr, int base) ***************************************************************************** */ +#define _ALIGN(type) offsetof(struct {char c; type v;}, v) +/* + * Disable harmless compiler warning 
"4116: unnamed type definition in + * parentheses" which is caused by the _ALIGN macro. + */ +#if defined(_MSC_VER) +#pragma warning(disable:4116) +#endif + /**begin repeat * @@ -246,8 +255,10 @@ static int } return -1; } - if (ap == NULL || PyArray_ISBEHAVED(ap)) + if (ap == NULL || PyArray_ISBEHAVED(ap)) { + assert(npy_is_aligned(ov, _ALIGN(@type@))); *((@type@ *)ov)=temp; + } else { PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap), ap); @@ -655,9 +666,7 @@ VOID_getitem(void *input, void *vap) { PyArrayObject *ap = vap; char *ip = input; - PyArrayObject *u = NULL; PyArray_Descr* descr; - int itemsize; descr = PyArray_DESCR(ap); if (PyDataType_HASFIELDS(descr)) { @@ -727,72 +736,10 @@ VOID_getitem(void *input, void *vap) Py_DECREF(ret); return NULL; } - PyArray_UpdateFlags((PyArrayObject *)ret, NPY_ARRAY_UPDATE_ALL); return (PyObject *)ret; } - /* 2017-11-26, 1.14 */ - if (DEPRECATE_FUTUREWARNING( - "the `.item()` method of unstructured void types will return an " - "immutable `bytes` object in the near future, the same as " - "returned by `bytes(void_obj)`, instead of the mutable memoryview " - "or integer array returned in numpy 1.13.") < 0) { - return NULL; - } - /* - * In the future all the code below will be replaced by - * - * For unstructured void types like V4, return a bytes object (copy). 
- * return PyBytes_FromStringAndSize(PyArray_DATA(ap), descr->elsize); - */ - - if (PyDataType_FLAGCHK(descr, NPY_ITEM_HASOBJECT) - || PyDataType_FLAGCHK(descr, NPY_ITEM_IS_POINTER)) { - PyErr_SetString(PyExc_ValueError, - "tried to get void-array with object members as buffer."); - return NULL; - } - itemsize = PyArray_DESCR(ap)->elsize; - -#if defined(NPY_PY3K) - /* - * Return a byte array; there are no plain buffer objects on Py3 - */ - { - npy_intp dims[1], strides[1]; - dims[0] = itemsize; - strides[0] = 1; - descr = PyArray_DescrNewFromType(NPY_BYTE); - u = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, - descr, 1, dims, strides, ip, - PyArray_ISWRITEABLE(ap) ? NPY_ARRAY_WRITEABLE : 0, - NULL); - Py_INCREF(ap); - if (PyArray_SetBaseObject(u, (PyObject *)ap) < 0) { - Py_DECREF(u); - return NULL; - } - } -#else - /* - * default is to return buffer object pointing to - * current item a view of it - */ - if (PyArray_ISWRITEABLE(ap)) { - if (array_might_be_written(ap) < 0) { - return NULL; - } - u = (PyArrayObject *)PyBuffer_FromReadWriteMemory(ip, itemsize); - } - else { - u = (PyArrayObject *)PyBuffer_FromMemory(ip, itemsize); - } -#endif - - if (u == NULL) { - return NULL; - } - return (PyObject *)u; + return PyBytes_FromStringAndSize(PyArray_DATA(ap), descr->elsize); } @@ -809,7 +756,7 @@ NPY_NO_EXPORT int PyArray_CopyObject(PyArrayObject *, PyObject *); */ NPY_NO_EXPORT int _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr, - npy_intp *offset_p) + npy_intp *offset_p, char *dstdata) { PyObject *key; PyObject *tup; @@ -823,7 +770,8 @@ _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr, } ((PyArrayObject_fields *)(arr))->descr = new; - if ((new->alignment > 1) && ((offset % new->alignment) != 0)) { + if ((new->alignment > 1) && + ((((uintptr_t)dstdata + offset) % new->alignment) != 0)) { PyArray_CLEARFLAGS(arr, NPY_ARRAY_ALIGNED); } else { @@ -851,7 +799,7 @@ _copy_and_return_void_setitem(PyArray_Descr *dstdescr, char *dstdata, if 
(PyArray_EquivTypes(srcdescr, dstdescr)) { for (i = 0; i < names_size; i++) { /* neither line can ever fail, in principle */ - if (_setup_field(i, dstdescr, dummy, &offset)) { + if (_setup_field(i, dstdescr, dummy, &offset, dstdata)) { return -1; } PyArray_DESCR(dummy)->f->copyswap(dstdata + offset, @@ -921,7 +869,7 @@ VOID_setitem(PyObject *op, void *input, void *vap) PyObject *item; /* temporarily make ap have only this field */ - if (_setup_field(i, descr, ap, &offset) == -1) { + if (_setup_field(i, descr, ap, &offset, ip) == -1) { failed = 1; break; } @@ -943,7 +891,7 @@ VOID_setitem(PyObject *op, void *input, void *vap) for (i = 0; i < names_size; i++) { /* temporarily make ap have only this field */ - if (_setup_field(i, descr, ap, &offset) == -1) { + if (_setup_field(i, descr, ap, &offset, ip) == -1) { failed = 1; break; } @@ -987,7 +935,6 @@ VOID_setitem(PyObject *op, void *input, void *vap) Py_DECREF(ret); return -1; } - PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL); res = PyArray_CopyObject(ret, op); Py_DECREF(ret); return res; @@ -4256,17 +4203,6 @@ small_correlate(const char * d_, npy_intp dstride, ***************************************************************************** */ - -#define _ALIGN(type) offsetof(struct {char c; type v;}, v) -/* - * Disable harmless compiler warning "4116: unnamed type definition in - * parentheses" which is caused by the _ALIGN macro. 
- */ -#if defined(_MSC_VER) -#pragma warning(disable:4116) -#endif - - /**begin repeat * * #from = VOID, STRING, UNICODE# diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h index d1c16cdea..a9469aef7 100644 --- a/numpy/core/src/multiarray/arraytypes.h +++ b/numpy/core/src/multiarray/arraytypes.h @@ -3,10 +3,6 @@ #include "common.h" -extern NPY_NO_EXPORT PyArray_Descr LONGLONG_Descr; -extern NPY_NO_EXPORT PyArray_Descr LONG_Descr; -extern NPY_NO_EXPORT PyArray_Descr INT_Descr; - NPY_NO_EXPORT int set_typeinfo(PyObject *dict); diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c index e24ac2b57..e47dd81b9 100644 --- a/numpy/core/src/multiarray/calculation.c +++ b/numpy/core/src/multiarray/calculation.c @@ -100,10 +100,10 @@ PyArray_ArgMax(PyArrayObject *op, int axis, PyArrayObject *out) } if (!out) { - rp = (PyArrayObject *)PyArray_New(Py_TYPE(ap), PyArray_NDIM(ap)-1, - PyArray_DIMS(ap), NPY_INTP, - NULL, NULL, 0, 0, - (PyObject *)ap); + rp = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(ap), PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(ap) - 1, PyArray_DIMS(ap), NULL, NULL, + 0, (PyObject *)ap); if (rp == NULL) { goto fail; } @@ -216,10 +216,10 @@ PyArray_ArgMin(PyArrayObject *op, int axis, PyArrayObject *out) } if (!out) { - rp = (PyArrayObject *)PyArray_New(Py_TYPE(ap), PyArray_NDIM(ap)-1, - PyArray_DIMS(ap), NPY_INTP, - NULL, NULL, 0, 0, - (PyObject *)ap); + rp = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(ap), PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(ap) - 1, PyArray_DIMS(ap), NULL, NULL, + 0, (PyObject *)ap); if (rp == NULL) { goto fail; } diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index 5ee385c46..25e3dcca3 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -273,9 +273,10 @@ arr_digitize(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) 
npy_intp stride = -PyArray_STRIDE(arr_bins, 0); void *data = (void *)(PyArray_BYTES(arr_bins) - stride * (shape - 1)); - arr_tmp = (PyArrayObject *)PyArray_New(&PyArray_Type, 1, &shape, - NPY_DOUBLE, &stride, data, 0, - PyArray_FLAGS(arr_bins), NULL); + arr_tmp = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_DOUBLE), + 1, &shape, &stride, data, + PyArray_FLAGS(arr_bins), NULL); if (!arr_tmp) { goto fail; } @@ -1362,11 +1363,11 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds) for (i = 0; i < dimensions.len; ++i) { PyArrayObject *view; - view = (PyArrayObject *)PyArray_New(&PyArray_Type, ret_ndim-1, - ret_dims, NPY_INTP, - ret_strides, - PyArray_BYTES(ret_arr) + i*sizeof(npy_intp), - 0, NPY_ARRAY_WRITEABLE, NULL); + view = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + ret_ndim - 1, ret_dims, ret_strides, + PyArray_BYTES(ret_arr) + i*sizeof(npy_intp), + NPY_ARRAY_WRITEABLE, NULL); if (view == NULL) { goto fail; } @@ -1621,8 +1622,10 @@ pack_bits(PyObject *input, int axis) if (PyArray_NDIM(new) == 0) { char *optr, *iptr; - out = (PyArrayObject *)PyArray_New(Py_TYPE(new), 0, NULL, NPY_UBYTE, - NULL, NULL, 0, 0, NULL); + out = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(new), PyArray_DescrFromType(NPY_UBYTE), + 0, NULL, NULL, NULL, + 0, NULL); if (out == NULL) { goto fail; } @@ -1652,9 +1655,10 @@ pack_bits(PyObject *input, int axis) outdims[axis] = ((outdims[axis] - 1) >> 3) + 1; /* Create output array */ - out = (PyArrayObject *)PyArray_New(Py_TYPE(new), - PyArray_NDIM(new), outdims, NPY_UBYTE, - NULL, NULL, 0, PyArray_ISFORTRAN(new), NULL); + out = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(new), PyArray_DescrFromType(NPY_UBYTE), + PyArray_NDIM(new), outdims, NULL, NULL, + PyArray_ISFORTRAN(new), NULL); if (out == NULL) { goto fail; } @@ -1746,9 +1750,10 @@ unpack_bits(PyObject *input, int axis) outdims[axis] <<= 3; /* Create output array */ - out = 
(PyArrayObject *)PyArray_New(Py_TYPE(new), - PyArray_NDIM(new), outdims, NPY_UBYTE, - NULL, NULL, 0, PyArray_ISFORTRAN(new), NULL); + out = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(new), PyArray_DescrFromType(NPY_UBYTE), + PyArray_NDIM(new), outdims, NULL, NULL, + PyArray_ISFORTRAN(new), NULL); if (out == NULL) { goto fail; } diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 5d3cee647..70f5c72aa 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -1143,8 +1143,8 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd, * true, dtype will be decrefed. */ NPY_NO_EXPORT PyObject * -PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd, - npy_intp *dims, npy_intp *strides, void *data, +PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, + int nd, npy_intp *dims, npy_intp *strides, void *data, int flags, PyObject *obj) { return PyArray_NewFromDescr_int(subtype, descr, nd, @@ -1358,8 +1358,6 @@ _array_from_buffer_3118(PyObject *memoryview) if (PyArray_SetBaseObject((PyArrayObject *)r, memoryview) < 0) { goto fail; } - PyArray_UpdateFlags((PyArrayObject *)r, NPY_ARRAY_UPDATE_ALL); - return r; fail: @@ -2124,7 +2122,6 @@ PyArray_FromStructInterface(PyObject *input) return NULL; } Py_DECREF(attr); - PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL); return (PyObject *)ret; fail: @@ -2999,11 +2996,26 @@ PyArray_Arange(double start, double stop, double step, int type_num) PyArray_ArrFuncs *funcs; PyObject *obj; int ret; + double delta, tmp_len; NPY_BEGIN_THREADS_DEF; - length = _arange_safe_ceil_to_intp((stop - start)/step); - if (error_converting(length)) { - return NULL; + delta = stop - start; + tmp_len = delta/step; + + /* Underflow and divide-by-inf check */ + if (tmp_len == 0.0 && delta != 0.0) { + if (npy_signbit(tmp_len)) { + length = 0; + } + else { + length = 1; + } + } + else { + length = _arange_safe_ceil_to_intp(tmp_len); + if 
(error_converting(length)) { + return NULL; + } } if (length <= 0) { @@ -3067,7 +3079,8 @@ static npy_intp _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, int cmplx) { npy_intp len, tmp; - PyObject *val; + PyObject *zero, *val; + int next_is_nonzero, val_is_zero; double value; *next = PyNumber_Subtract(stop, start); @@ -3080,12 +3093,37 @@ _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, i } return -1; } + + zero = PyInt_FromLong(0); + if (!zero) { + Py_DECREF(*next); + *next = NULL; + return -1; + } + + next_is_nonzero = PyObject_RichCompareBool(*next, zero, Py_NE); + if (next_is_nonzero == -1) { + Py_DECREF(zero); + Py_DECREF(*next); + *next = NULL; + return -1; + } val = PyNumber_TrueDivide(*next, step); Py_DECREF(*next); *next = NULL; + if (!val) { + Py_DECREF(zero); return -1; } + + val_is_zero = PyObject_RichCompareBool(val, zero, Py_EQ); + Py_DECREF(zero); + if (val_is_zero == -1) { + Py_DECREF(val); + return -1; + } + if (cmplx && PyComplex_Check(val)) { value = PyComplex_RealAsDouble(val); if (error_converting(value)) { @@ -3114,11 +3152,24 @@ _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, i if (error_converting(value)) { return -1; } - len = _arange_safe_ceil_to_intp(value); - if (error_converting(len)) { - return -1; + + /* Underflow and divide-by-inf check */ + if (val_is_zero && next_is_nonzero) { + if (npy_signbit(value)) { + len = 0; + } + else { + len = 1; + } + } + else { + len = _arange_safe_ceil_to_intp(value); + if (error_converting(len)) { + return -1; + } } } + if (len > 0) { *next = PyNumber_Add(start, step); if (!*next) { @@ -3633,7 +3684,6 @@ PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type, Py_DECREF(ret); return NULL; } - PyArray_UpdateFlags(ret, NPY_ARRAY_ALIGNED); return (PyObject *)ret; } diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c index a4a028ad4..af542aecc 100644 --- 
a/numpy/core/src/multiarray/datetime.c +++ b/numpy/core/src/multiarray/datetime.c @@ -2808,9 +2808,12 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj, us_meta.base = NPY_FR_m; } else if (td % (24*60*60*1000000LL) != 0) { - us_meta.base = NPY_FR_D; + us_meta.base = NPY_FR_h; } else if (td % (7*24*60*60*1000000LL) != 0) { + us_meta.base = NPY_FR_D; + } + else { us_meta.base = NPY_FR_W; } us_meta.num = 1; @@ -3679,11 +3682,11 @@ recursive_find_object_datetime64_type(PyObject *obj, return 0; } - /* Python date object -> 'D' */ - else if (PyDate_Check(obj)) { + /* Python datetime object -> 'us' */ + else if (PyDateTime_Check(obj)) { PyArray_DatetimeMetaData tmp_meta; - tmp_meta.base = NPY_FR_D; + tmp_meta.base = NPY_FR_us; tmp_meta.num = 1; /* Combine it with 'meta' */ @@ -3694,11 +3697,11 @@ recursive_find_object_datetime64_type(PyObject *obj, return 0; } - /* Python datetime object -> 'us' */ - else if (PyDateTime_Check(obj)) { + /* Python date object -> 'D' */ + else if (PyDate_Check(obj)) { PyArray_DatetimeMetaData tmp_meta; - tmp_meta.base = NPY_FR_us; + tmp_meta.base = NPY_FR_D; tmp_meta.num = 1; /* Combine it with 'meta' */ diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index c1c1ce568..bb3cc9d4e 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -18,6 +18,7 @@ #include "templ_common.h" /* for npy_mul_with_overflow_intp */ #include "descriptor.h" #include "alloc.h" +#include "assert.h" /* * offset: A starting offset. 
@@ -1938,33 +1939,26 @@ arraydescr_shape_get(PyArray_Descr *self) if (!PyDataType_HASSUBARRAY(self)) { return PyTuple_New(0); } - /*TODO - * self->subarray->shape should always be a tuple, - * so this check should be unnecessary - */ - if (PyTuple_Check(self->subarray->shape)) { - Py_INCREF(self->subarray->shape); - return (PyObject *)(self->subarray->shape); - } - return Py_BuildValue("(O)", self->subarray->shape); + assert(PyTuple_Check(self->subarray->shape)); + Py_INCREF(self->subarray->shape); + return self->subarray->shape; } static PyObject * arraydescr_ndim_get(PyArray_Descr *self) { + Py_ssize_t ndim; + if (!PyDataType_HASSUBARRAY(self)) { return PyInt_FromLong(0); } - /*TODO - * self->subarray->shape should always be a tuple, - * so this check should be unnecessary + + /* + * PyTuple_Size has built in check + * for tuple argument */ - if (PyTuple_Check(self->subarray->shape)) { - Py_ssize_t ndim = PyTuple_Size(self->subarray->shape); - return PyInt_FromLong(ndim); - } - /* consistent with arraydescr_shape_get */ - return PyInt_FromLong(1); + ndim = PyTuple_Size(self->subarray->shape); + return PyInt_FromLong(ndim); } diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c index e005234a0..c14653ac5 100644 --- a/numpy/core/src/multiarray/dragon4.c +++ b/numpy/core/src/multiarray/dragon4.c @@ -42,6 +42,18 @@ #define DEBUG_ASSERT(stmnt) do {} while(0) #endif +static inline npy_uint64 +bitmask_u64(npy_uint32 n) +{ + return ~(~((npy_uint64)0) << n); +} + +static inline npy_uint32 +bitmask_u32(npy_uint32 n) +{ + return ~(~((npy_uint32)0) << n); +} + /* * Get the log base 2 of a 32-bit unsigned integer. 
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup @@ -102,6 +114,17 @@ LogBase2_64(npy_uint64 val) return LogBase2_32((npy_uint32)val); } +#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE) +static npy_uint32 +LogBase2_128(npy_uint64 hi, npy_uint64 lo) +{ + if (hi) { + return 64 + LogBase2_64(hi); + } + + return LogBase2_64(lo); +} +#endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */ /* * Maximum number of 32 bit blocks needed in high precision arithmetic to print @@ -122,6 +145,45 @@ typedef struct BigInt { npy_uint32 blocks[c_BigInt_MaxBlocks]; } BigInt; +/* + * Dummy implementation of a memory manager for BigInts. Currently, only + * supports a single call to Dragon4, but that is OK because Dragon4 + * does not release the GIL. + * + * We try to raise an error anyway if dragon4 re-enters, and this code serves + * as a placeholder if we want to make it re-entrant in the future. + * + * Each call to dragon4 uses 7 BigInts. + */ +#define BIGINT_DRAGON4_GROUPSIZE 7 +typedef struct { + BigInt bigints[BIGINT_DRAGON4_GROUPSIZE]; + char repr[16384]; +} Dragon4_Scratch; + +static int _bigint_static_in_use = 0; +static Dragon4_Scratch _bigint_static; + +static Dragon4_Scratch* +get_dragon4_bigint_scratch(void) { + /* this test+set is not threadsafe, but no matter because we have GIL */ + if (_bigint_static_in_use) { + PyErr_SetString(PyExc_RuntimeError, + "numpy float printing code is not re-entrant. 
" + "Ping the devs to fix it."); + return NULL; + } + _bigint_static_in_use = 1; + + /* in this dummy implementation we only return the static allocation */ + return &_bigint_static; +} + +static void +free_dragon4_bigint_scratch(Dragon4_Scratch *mem){ + _bigint_static_in_use = 0; +} + /* Copy integer */ static void BigInt_Copy(BigInt *dst, const BigInt *src) @@ -139,26 +201,62 @@ BigInt_Copy(BigInt *dst, const BigInt *src) static void BigInt_Set_uint64(BigInt *i, npy_uint64 val) { - if (val > 0xFFFFFFFF) { - i->blocks[0] = val & 0xFFFFFFFF; - i->blocks[1] = (val >> 32) & 0xFFFFFFFF; + if (val > bitmask_u64(32)) { + i->blocks[0] = val & bitmask_u64(32); + i->blocks[1] = (val >> 32) & bitmask_u64(32); i->length = 2; } else if (val != 0) { - i->blocks[0] = val & 0xFFFFFFFF; + i->blocks[0] = val & bitmask_u64(32); + i->length = 1; + } + else { + i->length = 0; + } +} + +#if (defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) || \ + defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE) || \ + defined(HAVE_LDOUBLE_IEEE_QUAD_LE)) +static void +BigInt_Set_2x_uint64(BigInt *i, npy_uint64 hi, npy_uint64 lo) +{ + if (hi > bitmask_u64(32)) { + i->length = 4; + } + else if (hi != 0) { + i->length = 3; + } + else if (lo > bitmask_u64(32)) { + i->length = 2; + } + else if (lo != 0) { i->length = 1; } else { i->length = 0; } + + /* Note deliberate fallthrough in this switch */ + switch (i->length) { + case 4: + i->blocks[3] = (hi >> 32) & bitmask_u64(32); + case 3: + i->blocks[2] = hi & bitmask_u64(32); + case 2: + i->blocks[1] = (lo >> 32) & bitmask_u64(32); + case 1: + i->blocks[0] = lo & bitmask_u64(32); + } } +#endif /* DOUBLE_DOUBLE and QUAD */ static void BigInt_Set_uint32(BigInt *i, npy_uint32 val) { if (val != 0) { i->blocks[0] = val; - i->length = (val != 0); + i->length = 1; } else { i->length = 0; @@ -166,6 +264,24 @@ BigInt_Set_uint32(BigInt *i, npy_uint32 val) } /* + * Returns 1 if the value is zero + */ +static int +BigInt_IsZero(const BigInt *i) +{ + return i->length == 0; +} + +/* + 
* Returns 1 if the value is even + */ +static int +BigInt_IsEven(const BigInt *i) +{ + return (i->length == 0) || ( (i->blocks[0] % 2) == 0); +} + +/* * Returns 0 if (lhs = rhs), negative if (lhs < rhs), positive if (lhs > rhs) */ static npy_int32 @@ -228,7 +344,7 @@ BigInt_Add(BigInt *result, const BigInt *lhs, const BigInt *rhs) npy_uint64 sum = carry + (npy_uint64)(*largeCur) + (npy_uint64)(*smallCur); carry = sum >> 32; - *resultCur = sum & 0xFFFFFFFF; + *resultCur = sum & bitmask_u64(32); ++largeCur; ++smallCur; ++resultCur; @@ -238,7 +354,7 @@ BigInt_Add(BigInt *result, const BigInt *lhs, const BigInt *rhs) while (largeCur != largeEnd) { npy_uint64 sum = carry + (npy_uint64)(*largeCur); carry = sum >> 32; - (*resultCur) = sum & 0xFFFFFFFF; + (*resultCur) = sum & bitmask_u64(32); ++largeCur; ++resultCur; } @@ -307,13 +423,13 @@ BigInt_Multiply(BigInt *result, const BigInt *lhs, const BigInt *rhs) npy_uint64 product = (*resultCur) + (*largeCur)*(npy_uint64)multiplier + carry; carry = product >> 32; - *resultCur = product & 0xFFFFFFFF; + *resultCur = product & bitmask_u64(32); ++largeCur; ++resultCur; } while(largeCur != large->blocks + large->length); DEBUG_ASSERT(resultCur < result->blocks + maxResultLen); - *resultCur = (npy_uint32)(carry & 0xFFFFFFFF); + *resultCur = (npy_uint32)(carry & bitmask_u64(32)); } } @@ -337,7 +453,7 @@ BigInt_Multiply_int(BigInt *result, const BigInt *lhs, npy_uint32 rhs) const npy_uint32 *pLhsEnd = lhs->blocks + lhs->length; for ( ; pLhsCur != pLhsEnd; ++pLhsCur, ++resultCur) { npy_uint64 product = (npy_uint64)(*pLhsCur) * rhs + carry; - *resultCur = (npy_uint32)(product & 0xFFFFFFFF); + *resultCur = (npy_uint32)(product & bitmask_u64(32)); carry = product >> 32; } @@ -414,7 +530,7 @@ BigInt_Multiply10(BigInt *result) npy_uint32 *end = result->blocks + result->length; for ( ; cur != end; ++cur) { npy_uint64 product = (npy_uint64)(*cur) * 10ull + carry; - (*cur) = (npy_uint32)(product & 0xFFFFFFFF); + (*cur) = (npy_uint32)(product 
& bitmask_u64(32)); carry = product >> 32; } @@ -637,13 +753,11 @@ static BigInt g_PowerOf10_Big[] = /* result = 10^exponent */ static void -BigInt_Pow10(BigInt *result, npy_uint32 exponent) +BigInt_Pow10(BigInt *result, npy_uint32 exponent, BigInt *temp) { - /* create two temporary values to reduce large integer copy operations */ - BigInt temp1; - BigInt temp2; - BigInt *curTemp = &temp1; - BigInt *pNextTemp = &temp2; + /* use two temporary values to reduce large integer copy operations */ + BigInt *curTemp = result; + BigInt *pNextTemp = temp; npy_uint32 smallExponent; npy_uint32 tableIdx = 0; @@ -654,7 +768,7 @@ BigInt_Pow10(BigInt *result, npy_uint32 exponent) * initialize the result by looking up a 32-bit power of 10 corresponding to * the first 3 bits */ - smallExponent = exponent & 0x7; + smallExponent = exponent & bitmask_u32(3); BigInt_Set_uint32(curTemp, g_PowerOf10_U32[smallExponent]); /* remove the low bits that we used for the 32-bit lookup table */ @@ -681,19 +795,17 @@ BigInt_Pow10(BigInt *result, npy_uint32 exponent) } /* output the result */ - BigInt_Copy(result, curTemp); + if (curTemp != result) { + BigInt_Copy(result, curTemp); + } } -/* result = in * 10^exponent */ +/* in = in * 10^exponent */ static void -BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent) +BigInt_MultiplyPow10(BigInt *in, npy_uint32 exponent, BigInt *temp) { - - /* create two temporary values to reduce large integer copy operations */ - BigInt temp1; - BigInt temp2; - BigInt *curTemp = &temp1; - BigInt *pNextTemp = &temp2; + /* use two temporary values to reduce large integer copy operations */ + BigInt *curTemp, *pNextTemp; npy_uint32 smallExponent; npy_uint32 tableIdx = 0; @@ -704,12 +816,15 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent) * initialize the result by looking up a 32-bit power of 10 corresponding to * the first 3 bits */ - smallExponent = exponent & 0x7; + smallExponent = exponent & bitmask_u32(3); if 
(smallExponent != 0) { - BigInt_Multiply_int(curTemp, in, g_PowerOf10_U32[smallExponent]); + BigInt_Multiply_int(temp, in, g_PowerOf10_U32[smallExponent]); + curTemp = temp; + pNextTemp = in; } else { - BigInt_Copy(curTemp, in); + curTemp = in; + pNextTemp = temp; } /* remove the low bits that we used for the 32-bit lookup table */ @@ -724,7 +839,7 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent) /* multiply into the next temporary */ BigInt_Multiply(pNextTemp, curTemp, &g_PowerOf10_Big[tableIdx]); - // swap to the next temporary + /* swap to the next temporary */ pSwap = curTemp; curTemp = pNextTemp; pNextTemp = pSwap; @@ -736,7 +851,9 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent) } /* output the result */ - BigInt_Copy(result, curTemp); + if (curTemp != in){ + BigInt_Copy(in, curTemp); + } } /* result = 2^exponent */ @@ -788,7 +905,7 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor) */ DEBUG_ASSERT(!divisor->length == 0 && divisor->blocks[divisor->length-1] >= 8 && - divisor->blocks[divisor->length-1] < 0xFFFFFFFF && + divisor->blocks[divisor->length-1] < bitmask_u64(32) && dividend->length <= divisor->length); /* @@ -825,10 +942,10 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor) carry = product >> 32; difference = (npy_uint64)*dividendCur - - (product & 0xFFFFFFFF) - borrow; + - (product & bitmask_u64(32)) - borrow; borrow = (difference >> 32) & 1; - *dividendCur = difference & 0xFFFFFFFF; + *dividendCur = difference & bitmask_u64(32); ++divisorCur; ++dividendCur; @@ -860,7 +977,7 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor) - (npy_uint64)*divisorCur - borrow; borrow = (difference >> 32) & 1; - *dividendCur = difference & 0xFFFFFFFF; + *dividendCur = difference & bitmask_u64(32); ++divisorCur; ++dividendCur; @@ -993,12 +1110,20 @@ BigInt_ShiftLeft(BigInt *result, npy_uint32 shift) * There is 
some more documentation of these changes on Ryan Juckett's website * at http://www.ryanjuckett.com/programming/printing-floating-point-numbers/ * - * Ryan Juckett's implementation did not implement "IEEE unbiased rounding", - * except in the last digit. This has been added back, following the Burger & - * Dybvig code, using the isEven variable. + * This code also has a few implementation differences from Ryan Juckett's + * version: + * 1. fixed overflow problems when mantissa was 64 bits (in float128 types), + * by replacing multiplication by 2 or 4 by BigInt_ShiftLeft calls. + * 2. Increased c_BigInt_MaxBlocks, for 128-bit floats + * 3. Added more entries to the g_PowerOf10_Big table, for 128-bit floats. + * 4. Added unbiased rounding calculation with isEven. Ryan Juckett's + * implementation did not implement "IEEE unbiased rounding", except in the + * last digit. This has been added back, following the Burger & Dybvig + * code, using the isEven variable. * * Arguments: - * * mantissa - value significand + * * bigints - memory to store all bigints needed (7) for dragon4 computation. + * The first BigInt should be filled in with the mantissa. * * exponent - value exponent in base 2 * * mantissaBit - index of the highest set mantissa bit * * hasUnequalMargins - is the high margin twice as large as the low margin @@ -1007,9 +1132,11 @@ BigInt_ShiftLeft(BigInt *result, npy_uint32 shift) * * pOutBuffer - buffer to output into * * bufferSize - maximum characters that can be printed to pOutBuffer * * pOutExponent - the base 10 exponent of the first digit + * + * Returns the number of digits written to the output buffer. 
*/ static npy_uint32 -Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, +Dragon4(BigInt *bigints, const npy_int32 exponent, const npy_uint32 mantissaBit, const npy_bool hasUnequalMargins, const DigitMode digitMode, const CutoffMode cutoffMode, npy_int32 cutoffNumber, char *pOutBuffer, @@ -1025,21 +1152,24 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * Here, marginLow and marginHigh represent 1/2 of the distance to the next * floating point value above/below the mantissa. * - * scaledMarginHigh is a pointer so that it can point to scaledMarginLow in - * the case they must be equal to each other, otherwise it will point to - * optionalMarginHigh. + * scaledMarginHigh will point to scaledMarginLow in the case they must be + * equal to each other, otherwise it will point to optionalMarginHigh. */ - BigInt scale; - BigInt scaledValue; - BigInt scaledMarginLow; + BigInt *mantissa = &bigints[0]; /* the only initialized bigint */ + BigInt *scale = &bigints[1]; + BigInt *scaledValue = &bigints[2]; + BigInt *scaledMarginLow = &bigints[3]; BigInt *scaledMarginHigh; - BigInt optionalMarginHigh; + BigInt *optionalMarginHigh = &bigints[4]; + + BigInt *temp1 = &bigints[5]; + BigInt *temp2 = &bigints[6]; const npy_float64 log10_2 = 0.30102999566398119521373889472449; npy_int32 digitExponent, cutoffExponent, hiBlock; npy_uint32 outputDigit; /* current digit being output */ npy_uint32 outputLen; - npy_bool isEven = (mantissa % 2) == 0; + npy_bool isEven = BigInt_IsEven(mantissa); npy_int32 cmp; /* values used to determine how to round */ @@ -1048,12 +1178,14 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, DEBUG_ASSERT(bufferSize > 0); /* if the mantissa is zero, the value is zero regardless of the exponent */ - if (mantissa == 0) { + if (BigInt_IsZero(mantissa)) { *curDigit = '0'; *pOutExponent = 0; return 1; } + BigInt_Copy(scaledValue, mantissa); + if (hasUnequalMargins) { /* if we have no fractional component */ if (exponent > 0) { @@ 
-1067,17 +1199,13 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, */ /* scaledValue = 2 * 2 * mantissa*2^exponent */ - BigInt_Set_uint64(&scaledValue, mantissa); - BigInt_ShiftLeft(&scaledValue, exponent + 2); - + BigInt_ShiftLeft(scaledValue, exponent + 2); /* scale = 2 * 2 * 1 */ - BigInt_Set_uint32(&scale, 4); - + BigInt_Set_uint32(scale, 4); /* scaledMarginLow = 2 * 2^(exponent-1) */ - BigInt_Pow2(&scaledMarginLow, exponent); - + BigInt_Pow2(scaledMarginLow, exponent); /* scaledMarginHigh = 2 * 2 * 2^(exponent-1) */ - BigInt_Pow2(&optionalMarginHigh, exponent + 1); + BigInt_Pow2(optionalMarginHigh, exponent + 1); } /* else we have a fractional exponent */ else { @@ -1087,34 +1215,27 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, */ /* scaledValue = 2 * 2 * mantissa */ - BigInt_Set_uint64(&scaledValue, mantissa); - BigInt_ShiftLeft(&scaledValue, 2); - + BigInt_ShiftLeft(scaledValue, 2); /* scale = 2 * 2 * 2^(-exponent) */ - BigInt_Pow2(&scale, -exponent + 2); - + BigInt_Pow2(scale, -exponent + 2); /* scaledMarginLow = 2 * 2^(-1) */ - BigInt_Set_uint32(&scaledMarginLow, 1); - + BigInt_Set_uint32(scaledMarginLow, 1); /* scaledMarginHigh = 2 * 2 * 2^(-1) */ - BigInt_Set_uint32(&optionalMarginHigh, 2); + BigInt_Set_uint32(optionalMarginHigh, 2); } /* the high and low margins are different */ - scaledMarginHigh = &optionalMarginHigh; + scaledMarginHigh = optionalMarginHigh; } else { /* if we have no fractional component */ if (exponent > 0) { /* scaledValue = 2 * mantissa*2^exponent */ - BigInt_Set_uint64(&scaledValue, mantissa); - BigInt_ShiftLeft(&scaledValue, exponent + 1); - + BigInt_ShiftLeft(scaledValue, exponent + 1); /* scale = 2 * 1 */ - BigInt_Set_uint32(&scale, 2); - + BigInt_Set_uint32(scale, 2); /* scaledMarginLow = 2 * 2^(exponent-1) */ - BigInt_Pow2(&scaledMarginLow, exponent); + BigInt_Pow2(scaledMarginLow, exponent); } /* else we have a fractional exponent */ else { @@ -1124,18 +1245,15 @@ Dragon4(const npy_uint64 
mantissa, const npy_int32 exponent, */ /* scaledValue = 2 * mantissa */ - BigInt_Set_uint64(&scaledValue, mantissa); - BigInt_ShiftLeft(&scaledValue, 1); - + BigInt_ShiftLeft(scaledValue, 1); /* scale = 2 * 2^(-exponent) */ - BigInt_Pow2(&scale, -exponent + 1); - + BigInt_Pow2(scale, -exponent + 1); /* scaledMarginLow = 2 * 2^(-1) */ - BigInt_Set_uint32(&scaledMarginLow, 1); + BigInt_Set_uint32(scaledMarginLow, 1); } /* the high and low margins are equal */ - scaledMarginHigh = &scaledMarginLow; + scaledMarginHigh = scaledMarginLow; } /* @@ -1158,6 +1276,9 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * <= log10(v) + log10(2) * floor(log10(v)) < ceil((mantissaBit + exponent) * log10(2)) * <= floor(log10(v)) + 1 + * + * Warning: This calculation assumes npy_float64 is an IEEE-binary64 + * float. This line may need to be updated if this is not the case. */ digitExponent = (npy_int32)( ceil((npy_float64)((npy_int32)mantissaBit + exponent) * log10_2 - 0.69)); @@ -1179,31 +1300,29 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, /* Divide value by 10^digitExponent. */ if (digitExponent > 0) { /* A positive exponent creates a division so we multiply the scale. */ - BigInt temp; - BigInt_MultiplyPow10(&temp, &scale, digitExponent); - BigInt_Copy(&scale, &temp); + BigInt_MultiplyPow10(scale, digitExponent, temp1); } else if (digitExponent < 0) { /* * A negative exponent creates a multiplication so we multiply up the * scaledValue, scaledMarginLow and scaledMarginHigh. 
*/ - BigInt pow10, temp; - BigInt_Pow10(&pow10, -digitExponent); + BigInt *temp=temp1, *pow10=temp2; + BigInt_Pow10(pow10, -digitExponent, temp); - BigInt_Multiply(&temp, &scaledValue, &pow10); - BigInt_Copy(&scaledValue, &temp); + BigInt_Multiply(temp, scaledValue, pow10); + BigInt_Copy(scaledValue, temp); - BigInt_Multiply(&temp, &scaledMarginLow, &pow10); - BigInt_Copy(&scaledMarginLow, &temp); + BigInt_Multiply(temp, scaledMarginLow, pow10); + BigInt_Copy(scaledMarginLow, temp); - if (scaledMarginHigh != &scaledMarginLow) { - BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); } } /* If (value >= 1), our estimate for digitExponent was too low */ - if (BigInt_Compare(&scaledValue, &scale) >= 0) { + if (BigInt_Compare(scaledValue, scale) >= 0) { /* * The exponent estimate was incorrect. * Increment the exponent and don't perform the premultiply needed @@ -1217,10 +1336,10 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * Multiply larger by the output base to prepare for the first loop * iteration. */ - BigInt_Multiply10(&scaledValue); - BigInt_Multiply10(&scaledMarginLow); - if (scaledMarginHigh != &scaledMarginLow) { - BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow); + BigInt_Multiply10(scaledValue); + BigInt_Multiply10(scaledMarginLow); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); } } @@ -1261,8 +1380,8 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * to be less than or equal to 429496729 which is the highest number that * can be multiplied by 10 without overflowing to a new block. 
*/ - DEBUG_ASSERT(scale.length > 0); - hiBlock = scale.blocks[scale.length - 1]; + DEBUG_ASSERT(scale->length > 0); + hiBlock = scale->blocks[scale->length - 1]; if (hiBlock < 8 || hiBlock > 429496729) { npy_uint32 hiBlockLog2, shift; @@ -1280,11 +1399,11 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, DEBUG_ASSERT(hiBlockLog2 < 3 || hiBlockLog2 > 27); shift = (32 + 27 - hiBlockLog2) % 32; - BigInt_ShiftLeft(&scale, shift); - BigInt_ShiftLeft(&scaledValue, shift); - BigInt_ShiftLeft(&scaledMarginLow, shift); - if (scaledMarginHigh != &scaledMarginLow) { - BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow); + BigInt_ShiftLeft(scale, shift); + BigInt_ShiftLeft(scaledValue, shift); + BigInt_ShiftLeft(scaledMarginLow, shift); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); } } @@ -1296,25 +1415,25 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * terminate early. */ for (;;) { - BigInt scaledValueHigh; + BigInt *scaledValueHigh = temp1; digitExponent = digitExponent-1; /* divide out the scale to extract the digit */ outputDigit = - BigInt_DivideWithRemainder_MaxQuotient9(&scaledValue, &scale); + BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale); DEBUG_ASSERT(outputDigit < 10); /* update the high end of the value */ - BigInt_Add(&scaledValueHigh, &scaledValue, scaledMarginHigh); + BigInt_Add(scaledValueHigh, scaledValue, scaledMarginHigh); /* * stop looping if we are far enough away from our neighboring * values or if we have reached the cutoff digit */ - cmp = BigInt_Compare(&scaledValue, &scaledMarginLow); + cmp = BigInt_Compare(scaledValue, scaledMarginLow); low = isEven ? (cmp <= 0) : (cmp < 0); - cmp = BigInt_Compare(&scaledValueHigh, &scale); + cmp = BigInt_Compare(scaledValueHigh, scale); high = isEven ? 
(cmp >= 0) : (cmp > 0); if (low | high | (digitExponent == cutoffExponent)) break; @@ -1324,10 +1443,10 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, ++curDigit; /* multiply larger by the output base */ - BigInt_Multiply10(&scaledValue); - BigInt_Multiply10(&scaledMarginLow); - if (scaledMarginHigh != &scaledMarginLow) { - BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow); + BigInt_Multiply10(scaledValue); + BigInt_Multiply10(scaledMarginLow); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); } } } @@ -1345,10 +1464,11 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, /* divide out the scale to extract the digit */ outputDigit = - BigInt_DivideWithRemainder_MaxQuotient9(&scaledValue, &scale); + BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale); DEBUG_ASSERT(outputDigit < 10); - if ((scaledValue.length == 0) | (digitExponent == cutoffExponent)) { + if ((scaledValue->length == 0) | + (digitExponent == cutoffExponent)) { break; } @@ -1357,7 +1477,7 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, ++curDigit; /* multiply larger by the output base */ - BigInt_Multiply10(&scaledValue); + BigInt_Multiply10(scaledValue); } } @@ -1375,8 +1495,8 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, * compare( scale * value, scale * 0.5 ) * compare( 2 * scale * value, scale ) */ - BigInt_Multiply2_inplace(&scaledValue); - compare = BigInt_Compare(&scaledValue, &scale); + BigInt_Multiply2_inplace(scaledValue); + compare = BigInt_Compare(scaledValue, scale); roundDown = compare < 0; /* @@ -1431,134 +1551,53 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent, /* - * Helper union to decompose a 16-bit IEEE float. 
- * sign: 1 bit - * exponent: 5 bits - * mantissa: 10 bits - */ -typedef union FloatUnion16 -{ - npy_uint16 integer; -} FloatUnion16; - -npy_bool IsNegative_F16(FloatUnion16 *v) { return (v->integer >> 15) != 0; } -npy_uint32 GetExponent_F16(FloatUnion16 *v) { return (v->integer >> 10) & 0x1F;} -npy_uint32 GetMantissa_F16(FloatUnion16 *v) { return v->integer & 0x3FF; } - - -/* - * Helper union to decompose a 32-bit IEEE float. - * sign: 1 bit - * exponent: 8 bits - * mantissa: 23 bits - */ -typedef union FloatUnion32 -{ - npy_float32 floatingPoint; - npy_uint32 integer; -} FloatUnion32; - -npy_bool IsNegative_F32(FloatUnion32 *v) { return (v->integer >> 31) != 0; } -npy_uint32 GetExponent_F32(FloatUnion32 *v) { return (v->integer >> 23) & 0xFF;} -npy_uint32 GetMantissa_F32(FloatUnion32 *v) { return v->integer & 0x7FFFFF; } - -/* - * Helper union to decompose a 64-bit IEEE float. - * sign: 1 bit - * exponent: 11 bits - * mantissa: 52 bits - */ -typedef union FloatUnion64 -{ - npy_float64 floatingPoint; - npy_uint64 integer; -} FloatUnion64; -npy_bool IsNegative_F64(FloatUnion64 *v) { return (v->integer >> 63) != 0; } -npy_uint32 GetExponent_F64(FloatUnion64 *v) { return (v->integer >> 52) & 0x7FF; } -npy_uint64 GetMantissa_F64(FloatUnion64 *v) { return v->integer & 0xFFFFFFFFFFFFFull; } - -/* - * Helper unions and datatype to decompose a 80-bit IEEE float - * sign: 1 bit, second u64 - * exponent: 15 bits, second u64 - * intbit 1 bit, first u64 - * mantissa: 63 bits, first u64 - */ - -/* - * Since systems have different types of long doubles, and may not necessarily - * have a 128-byte format we can use to pass values around, here we create - * our own 128-bit storage type for convenience. 
- */ -typedef struct FloatVal128 { - npy_uint64 integer[2]; -} FloatVal128; -npy_bool IsNegative_F128(FloatVal128 *v) { - return ((v->integer[1] >> 15) & 0x1) != 0; -} -npy_uint32 GetExponent_F128(FloatVal128 *v) { return v->integer[1] & 0x7FFF; } -npy_uint64 GetMantissa_F128(FloatVal128 *v) { - return v->integer[0] & 0x7FFFFFFFFFFFFFFFull; -} - -/* - * then for each different definition of long double, we create a union to - * unpack the float data safely. We can then copy these integers to a - * FloatVal128. + * The FormatPositional and FormatScientific functions have been more + * significantly rewritten relative to Ryan Juckett's code. + * + * The binary16 and the various 128-bit float functions are new, and adapted + * from the 64 bit version. The python interface functions are new. */ -#ifdef NPY_FLOAT128 -typedef union FloatUnion128 -{ - npy_float128 floatingPoint; - struct { - npy_uint64 a; - npy_uint16 b; - } integer; -} FloatUnion128; -#endif - -#ifdef NPY_FLOAT96 -typedef union FloatUnion96 -{ - npy_float96 floatingPoint; - struct { - npy_uint64 a; - npy_uint32 b; - } integer; -} FloatUnion96; -#endif - -#ifdef NPY_FLOAT80 -typedef union FloatUnion80 -{ - npy_float80 floatingPoint; - struct { - npy_uint64 a; - npy_uint16 b; - } integer; -} FloatUnion80; -#endif -/* - * The main changes above this point, relative to Ryan Juckett's code, are: - * 1. fixed overflow problems when mantissa was 64 bits (in float128 types), - * by replacing multiplication by 2 or 4 by BigInt_ShiftLeft calls. - * 2. Increased c_BigInt_MaxBlocks - * 3. Added more entries to the g_PowerOf10_Big table - * 4. Added unbiased rounding calculation with isEven +/* Options struct for easy passing of Dragon4 options. * - * Below this point, the FormatPositional and FormatScientific functions have - * been more significantly rewritten. The Dragon4_PrintFloat16 and - * Dragon4_PrintFloat128 functions are new, and were adapted from the 64 and 32 - * bit versions. 
The python interfacing functions (in the header) are new. + * scientific - boolean controlling whether scientific notation is used + * digit_mode - whether to use unique or fixed fracional output + * cutoff_mode - whether 'precision' refers to toal digits, or digits past + * the decimal point. + * precision - When negative, prints as many digits as needed for a unique + * number. When positive specifies the maximum number of + * significant digits to print. + * sign - whether to always show sign + * trim_mode - how to treat trailing 0s and '.'. See TrimMode comments. + * digits_left - pad characters to left of decimal point. -1 for no padding + * digits_right - pad characters to right of decimal point. -1 for no padding. + * Padding adds whitespace until there are the specified + * number characters to sides of decimal point. Applies after + * trim_mode characters were removed. If digits_right is + * positive and the decimal point was trimmed, decimal point + * will be replaced by a whitespace character. + * exp_digits - Only affects scientific output. If positive, pads the + * exponent with 0s until there are this many digits. If + * negative, only use sufficient digits. */ - +typedef struct Dragon4_Options { + npy_bool scientific; + DigitMode digit_mode; + CutoffMode cutoff_mode; + npy_int32 precision; + npy_bool sign; + TrimMode trim_mode; + npy_int32 digits_left; + npy_int32 digits_right; + npy_int32 exp_digits; +} Dragon4_Options; /* * Outputs the positive number with positional notation: ddddd.dddd * The output is always NUL terminated and the output length (not including the * NUL) is returned. + * * Arguments: * buffer - buffer to output into * bufferSize - maximum characters that can be printed to buffer @@ -1567,20 +1606,11 @@ typedef union FloatUnion80 * signbit - value of the sign position. 
Should be '+', '-' or '' * mantissaBit - index of the highest set mantissa bit * hasUnequalMargins - is the high margin twice as large as the low margin - * precision - Negative prints as many digits as are needed for a unique - * number. Positive specifies the maximum number of significant - * digits to print past the decimal point. - * trim_mode - how to treat trailing 0s and '.'. See TrimMode comments. - * digits_left - pad characters to left of decimal point. -1 for no padding - * digits_right - pad characters to right of decimal point. -1 for no padding - * padding adds whitespace until there are the specified - * number characters to sides of decimal point. Applies after - * trim_mode characters were removed. If digits_right is - * positive and the decimal point was trimmed, decimal point - * will be replaced by a whitespace character. + * + * See Dragon4_Options for description of remaining arguments. */ static npy_uint32 -FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, +FormatPositional(char *buffer, npy_uint32 bufferSize, BigInt *mantissa, npy_int32 exponent, char signbit, npy_uint32 mantissaBit, npy_bool hasUnequalMargins, DigitMode digit_mode, CutoffMode cutoff_mode, npy_int32 precision, @@ -1707,7 +1737,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, /* always add decimal point, except for DprZeros mode */ if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 && - pos < maxPrintLen){ + pos < maxPrintLen) { buffer[pos++] = '.'; } @@ -1745,7 +1775,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, * when rounding, we may still end up with trailing zeros. Remove them * depending on trim settings. 
*/ - if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0){ + if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0) { while (buffer[pos-1] == '0') { pos--; numFractionDigits--; @@ -1779,7 +1809,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, npy_int32 shift = digits_left - (numWholeDigits + has_sign); npy_int32 count = pos; - if (count + shift > maxPrintLen){ + if (count + shift > maxPrintLen) { count = maxPrintLen - shift; } @@ -1803,6 +1833,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, * Outputs the positive number with scientific notation: d.dddde[sign]ddd * The output is always NUL terminated and the output length (not including the * NUL) is returned. + * * Arguments: * buffer - buffer to output into * bufferSize - maximum characters that can be printed to buffer @@ -1811,15 +1842,11 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, * signbit - value of the sign position. Should be '+', '-' or '' * mantissaBit - index of the highest set mantissa bit * hasUnequalMargins - is the high margin twice as large as the low margin - * precision - Negative prints as many digits as are needed for a unique - * number. Positive specifies the maximum number of significant - * digits to print past the decimal point. - * trim_mode - how to treat trailing 0s and '.'. See TrimMode comments. - * digits_left - pad characters to left of decimal point. -1 for no padding - * exp_digits - pads exponent with zeros until it has this many digits + * + * See Dragon4_Options for description of remaining arguments. 
*/ static npy_uint32 -FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, +FormatScientific (char *buffer, npy_uint32 bufferSize, BigInt *mantissa, npy_int32 exponent, char signbit, npy_uint32 mantissaBit, npy_bool hasUnequalMargins, DigitMode digit_mode, npy_int32 precision, TrimMode trim_mode, @@ -1844,7 +1871,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, leftchars = 1 + (signbit == '-' || signbit == '+'); if (digits_left > leftchars) { int i; - for (i = 0; i < digits_left - leftchars && bufferSize > 1; i++){ + for (i = 0; i < digits_left - leftchars && bufferSize > 1; i++) { *pCurOut = ' '; pCurOut++; --bufferSize; @@ -1892,7 +1919,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, /* always add decimal point, except for DprZeros mode */ if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 && - bufferSize > 1){ + bufferSize > 1) { *pCurOut = '.'; ++pCurOut; --bufferSize; @@ -1931,7 +1958,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, * when rounding, we may still end up with trailing zeros. Remove them * depending on trim settings. 
*/ - if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0){ + if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0) { --pCurOut; while (*pCurOut == '0') { --pCurOut; @@ -1972,7 +1999,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, DEBUG_ASSERT(printExponent < 100000); /* get exp digits */ - for (i = 0; i < 5; i++){ + for (i = 0; i < 5; i++) { digits[i] = printExponent % 10; printExponent /= 10; } @@ -1981,7 +2008,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, } exp_size = i; /* write remaining digits to tmp buf */ - for (i = exp_size; i > 0; i--){ + for (i = exp_size; i > 0; i--) { exponentBuffer[2 + (exp_size-i)] = (char)('0' + digits[i-1]); } @@ -2057,12 +2084,12 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, /* only print sign for inf values (though nan can have a sign set) */ if (signbit == '+') { - if (pos < maxPrintLen-1){ + if (pos < maxPrintLen-1) { buffer[pos++] = '+'; } } else if (signbit == '-') { - if (pos < maxPrintLen-1){ + if (pos < maxPrintLen-1) { buffer[pos++] = '-'; } } @@ -2080,7 +2107,9 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, buffer[pos + printLen] = '\0'; /* - * // XXX: Should we change this for numpy? + * For numpy we ignore unusual mantissa values for nan, but keep this + * code in case we change our mind later. + * * // append HEX value * if (maxPrintLen > 3) { * printLen += PrintHex(buffer+3, bufferSize-3, mantissa, @@ -2093,34 +2122,63 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, } /* - * These functions print a floating-point number as a decimal string. - * The output string is always NUL terminated and the string length (not - * including the NUL) is returned. + * The functions below format a floating-point numbers stored in particular + * formats, as a decimal string. 
The output string is always NUL terminated + * and the string length (not including the NUL) is returned. + * + * For 16, 32 and 64 bit floats we assume they are the IEEE 754 type. + * For 128 bit floats we account for different definitions. * * Arguments are: * buffer - buffer to output into * bufferSize - maximum characters that can be printed to buffer - * value - value significand - * scientific - boolean controlling whether scientific notation is used - * precision - If positive, specifies the number of decimals to show after - * decimal point. If negative, sufficient digits to uniquely - * specify the float will be output. - * trim_mode - how to treat trailing zeros and decimal point. See TrimMode. - * digits_right - pad the result with '' on the right past the decimal point - * digits_left - pad the result with '' on the right past the decimal point - * exp_digits - Only affects scientific output. If positive, pads the - * exponent with 0s until there are this many digits. If - * negative, only use sufficient digits. + * value - value to print + * opt - Dragon4 options, see above + */ + +/* + * Helper function that takes Dragon4 parameters and options and + * calls Dragon4. 
+ */ +static npy_uint32 +Format_floatbits(char *buffer, npy_uint32 bufferSize, BigInt *mantissa, + npy_int32 exponent, char signbit, npy_uint32 mantissaBit, + npy_bool hasUnequalMargins, Dragon4_Options *opt) +{ + /* format the value */ + if (opt->scientific) { + return FormatScientific(buffer, bufferSize, mantissa, exponent, + signbit, mantissaBit, hasUnequalMargins, + opt->digit_mode, opt->precision, + opt->trim_mode, opt->digits_left, + opt->exp_digits); + } + else { + return FormatPositional(buffer, bufferSize, mantissa, exponent, + signbit, mantissaBit, hasUnequalMargins, + opt->digit_mode, opt->cutoff_mode, + opt->precision, opt->trim_mode, + opt->digits_left, opt->digits_right); + } +} + +/* + * IEEE binary16 floating-point format + * + * sign: 1 bit + * exponent: 5 bits + * mantissa: 10 bits */ static npy_uint32 -Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value, - npy_bool scientific, DigitMode digit_mode, - CutoffMode cutoff_mode, npy_int32 precision, - npy_bool sign, TrimMode trim_mode, npy_int32 digits_left, - npy_int32 digits_right, npy_int32 exp_digits) +Dragon4_PrintFloat_IEEE_binary16( + Dragon4_Scratch *scratch, npy_half *value, Dragon4_Options *opt) { - FloatUnion16 floatUnion; - npy_uint32 floatExponent, floatMantissa; + char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + npy_uint16 val = *value; + npy_uint32 floatExponent, floatMantissa, floatSign; npy_uint32 mantissa; npy_int32 exponent; @@ -2138,20 +2196,20 @@ Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value, } /* deconstruct the floating point value */ - floatUnion.integer = value; - floatExponent = GetExponent_F16(&floatUnion); - floatMantissa = GetMantissa_F16(&floatUnion); + floatMantissa = val & bitmask_u32(10); + floatExponent = (val >> 10) & bitmask_u32(5); + floatSign = val >> 15; /* output the sign */ - if (IsNegative_F16(&floatUnion)) { + if (floatSign != 0) { 
signbit = '-'; } - else if (sign) { + else if (opt->sign) { signbit = '+'; } /* if this is a special value */ - if (floatExponent == 0x1F) { + if (floatExponent == bitmask_u32(5)) { return PrintInfNan(buffer, bufferSize, floatMantissa, 3, signbit); } /* else this is a number */ @@ -2195,29 +2253,33 @@ Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value, hasUnequalMargins = NPY_FALSE; } - /* format the value */ - if (scientific) { - return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - precision, trim_mode, digits_left, exp_digits); - } - else { - return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - cutoff_mode, precision, trim_mode, - digits_left, digits_right); - } + BigInt_Set_uint32(&bigints[0], mantissa); + return Format_floatbits(buffer, bufferSize, bigints, exponent, + signbit, mantissaBit, hasUnequalMargins, opt); } +/* + * IEEE binary32 floating-point format + * + * sign: 1 bit + * exponent: 8 bits + * mantissa: 23 bits + */ static npy_uint32 -Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value, - npy_bool scientific, DigitMode digit_mode, - CutoffMode cutoff_mode, npy_int32 precision, - npy_bool sign, TrimMode trim_mode, npy_int32 digits_left, - npy_int32 digits_right, npy_int32 exp_digits) +Dragon4_PrintFloat_IEEE_binary32( + Dragon4_Scratch *scratch, npy_float32 *value, + Dragon4_Options *opt) { - FloatUnion32 floatUnion; - npy_uint32 floatExponent, floatMantissa; + char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + union + { + npy_float32 floatingPoint; + npy_uint32 integer; + } floatUnion; + npy_uint32 floatExponent, floatMantissa, floatSign; npy_uint32 mantissa; npy_int32 exponent; @@ -2235,20 +2297,21 @@ Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value, } /* deconstruct the 
floating point value */ - floatUnion.floatingPoint = value; - floatExponent = GetExponent_F32(&floatUnion); - floatMantissa = GetMantissa_F32(&floatUnion); + floatUnion.floatingPoint = *value; + floatMantissa = floatUnion.integer & bitmask_u32(23); + floatExponent = (floatUnion.integer >> 23) & bitmask_u32(8); + floatSign = floatUnion.integer >> 31; /* output the sign */ - if (IsNegative_F32(&floatUnion)) { + if (floatSign != 0) { signbit = '-'; } - else if (sign) { + else if (opt->sign) { signbit = '+'; } /* if this is a special value */ - if (floatExponent == 0xFF) { + if (floatExponent == bitmask_u32(8)) { return PrintInfNan(buffer, bufferSize, floatMantissa, 6, signbit); } /* else this is a number */ @@ -2292,29 +2355,32 @@ Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value, hasUnequalMargins = NPY_FALSE; } - /* format the value */ - if (scientific) { - return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - precision, trim_mode, digits_left, exp_digits); - } - else { - return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - cutoff_mode, precision, trim_mode, - digits_left, digits_right); - } + BigInt_Set_uint32(&bigints[0], mantissa); + return Format_floatbits(buffer, bufferSize, bigints, exponent, + signbit, mantissaBit, hasUnequalMargins, opt); } +/* + * IEEE binary64 floating-point format + * + * sign: 1 bit + * exponent: 11 bits + * mantissa: 52 bits + */ static npy_uint32 -Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value, - npy_bool scientific, DigitMode digit_mode, - CutoffMode cutoff_mode, npy_int32 precision, - npy_bool sign, TrimMode trim_mode, npy_int32 digits_left, - npy_int32 digits_right, npy_int32 exp_digits) +Dragon4_PrintFloat_IEEE_binary64( + Dragon4_Scratch *scratch, npy_float64 *value, Dragon4_Options *opt) { - FloatUnion64 floatUnion; - npy_uint32 floatExponent; 
+ char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + union + { + npy_float64 floatingPoint; + npy_uint64 integer; + } floatUnion; + npy_uint32 floatExponent, floatSign; npy_uint64 floatMantissa; npy_uint64 mantissa; @@ -2333,20 +2399,21 @@ Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value, } /* deconstruct the floating point value */ - floatUnion.floatingPoint = value; - floatExponent = GetExponent_F64(&floatUnion); - floatMantissa = GetMantissa_F64(&floatUnion); + floatUnion.floatingPoint = *value; + floatMantissa = floatUnion.integer & bitmask_u64(52); + floatExponent = (floatUnion.integer >> 52) & bitmask_u32(11); + floatSign = floatUnion.integer >> 63; /* output the sign */ - if (IsNegative_F64(&floatUnion)) { + if (floatSign != 0) { signbit = '-'; } - else if (sign) { + else if (opt->sign) { signbit = '+'; } /* if this is a special value */ - if (floatExponent == 0x7FF) { + if (floatExponent == bitmask_u32(11)) { return PrintInfNan(buffer, bufferSize, floatMantissa, 13, signbit); } /* else this is a number */ @@ -2390,28 +2457,48 @@ Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value, hasUnequalMargins = NPY_FALSE; } - /* format the value */ - if (scientific) { - return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - precision, trim_mode, digits_left, exp_digits); - } - else { - return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - cutoff_mode, precision, trim_mode, - digits_left, digits_right); - } + BigInt_Set_uint64(&bigints[0], mantissa); + return Format_floatbits(buffer, bufferSize, bigints, exponent, + signbit, mantissaBit, hasUnequalMargins, opt); } + +/* + * Since systems have different types of long doubles, and may not necessarily + * have a 128-byte format we can use to pass values around, here we 
create + * our own 128-bit storage type for convenience. + */ +typedef struct FloatVal128 { + npy_uint64 hi, lo; +} FloatVal128; + +#if defined(HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE) || \ + defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE) || \ + defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE) || \ + defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE) +/* + * Intel's 80-bit IEEE extended precision floating-point format + * + * "long doubles" with this format are stored as 96 or 128 bits, but + * are equivalent to the 80 bit type with some zero padding on the high bits. + * This method expects the user to pass in the value using a 128-bit + * FloatVal128, so can support 80, 96, or 128 bit storage formats, + * and is endian-independent. + * + * sign: 1 bit, second u64 + * exponent: 15 bits, second u64 + * intbit 1 bit, first u64 + * mantissa: 63 bits, first u64 + */ static npy_uint32 -Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value, - npy_bool scientific, DigitMode digit_mode, - CutoffMode cutoff_mode, npy_int32 precision, - npy_bool sign, TrimMode trim_mode, npy_int32 digits_left, - npy_int32 digits_right, npy_int32 exp_digits) +Dragon4_PrintFloat_Intel_extended( + Dragon4_Scratch *scratch, FloatVal128 value, Dragon4_Options *opt) { - npy_uint32 floatExponent; + char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + npy_uint32 floatExponent, floatSign; npy_uint64 floatMantissa; npy_uint64 mantissa; @@ -2429,20 +2516,27 @@ Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value, return 0; } - /* deconstruct the floating point value */ - floatExponent = GetExponent_F128(&value); - floatMantissa = GetMantissa_F128(&value); + /* deconstruct the floating point value (we ignore the intbit) */ + floatMantissa = value.lo & bitmask_u64(63); + floatExponent = value.hi & bitmask_u32(15); + floatSign = (value.hi >> 15) & 0x1; /* output the sign */ - if 
(IsNegative_F128(&value)) { + if (floatSign != 0) { signbit = '-'; } - else if (sign) { + else if (opt->sign) { signbit = '+'; } /* if this is a special value */ - if (floatExponent == 0x7FFF) { + if (floatExponent == bitmask_u32(15)) { + /* + * Note: Technically there are other special extended values defined if + * the intbit is 0, like Pseudo-Infinity, Pseudo-Nan, Quiet-NaN. We + * ignore all of these since they are not generated on modern + * processors. We treat Quiet-Nan as simply Nan. + */ return PrintInfNan(buffer, bufferSize, floatMantissa, 16, signbit); } /* else this is a number */ @@ -2486,247 +2580,662 @@ Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value, hasUnequalMargins = NPY_FALSE; } - /* format the value */ - if (scientific) { - return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - precision, trim_mode, digits_left, exp_digits); + BigInt_Set_uint64(&bigints[0], mantissa); + return Format_floatbits(buffer, bufferSize, bigints, exponent, + signbit, mantissaBit, hasUnequalMargins, opt); + +} + +#endif /* INTEL_EXTENDED group */ + + +#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE +/* + * Intel's 80-bit IEEE extended precision format, 80-bit storage + * + * Note: It is not clear if a long double with 10-byte storage exists on any + * system. But numpy defines NPY_FLOAT80, so if we come across it, assume it is + * an Intel extended format. 
+ */ +static npy_uint32 +Dragon4_PrintFloat_Intel_extended80( + Dragon4_Scratch *scratch, npy_float80 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + union { + npy_float80 floatingPoint; + struct { + npy_uint64 a; + npy_uint16 b; + } integer; + } buf80; + + buf80.floatingPoint = *value; + /* Intel is little-endian */ + val128.lo = buf80.integer.a; + val128.hi = buf80.integer.b; + + return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt); +} +#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE */ + +#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE +/* Intel's 80-bit IEEE extended precision format, 96-bit storage */ +static npy_uint32 +Dragon4_PrintFloat_Intel_extended96( + Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + union { + npy_float96 floatingPoint; + struct { + npy_uint64 a; + npy_uint32 b; + } integer; + } buf96; + + buf96.floatingPoint = *value; + /* Intel is little-endian */ + val128.lo = buf96.integer.a; + val128.hi = buf96.integer.b; + + return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt); +} +#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE */ + +#ifdef HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE +/* Motorola Big-endian equivalent of the Intel-extended 96 fp format */ +static npy_uint32 +Dragon4_PrintFloat_Motorola_extended96( + Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + union { + npy_float96 floatingPoint; + struct { + npy_uint64 a; + npy_uint32 b; + } integer; + } buf96; + + buf96.floatingPoint = *value; + /* Motorola is big-endian */ + val128.lo = buf96.integer.b; + val128.hi = buf96.integer.a >> 16; + /* once again we assume the int has same endianness as the float */ + + return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt); +} +#endif /* HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE */ + + +#ifdef NPY_FLOAT128 + +typedef union FloatUnion128 +{ + npy_float128 floatingPoint; + struct { + npy_uint64 a; + npy_uint64 b; + } 
integer; +} FloatUnion128; + +#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE +/* Intel's 80-bit IEEE extended precision format, 128-bit storage */ +static npy_uint32 +Dragon4_PrintFloat_Intel_extended128( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + FloatUnion128 buf128; + + buf128.floatingPoint = *value; + /* Intel is little-endian */ + val128.lo = buf128.integer.a; + val128.hi = buf128.integer.b; + + return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt); +} +#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE */ + +#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE) +/* + * IEEE binary128 floating-point format + * + * sign: 1 bit + * exponent: 15 bits + * mantissa: 112 bits + * + * Currently binary128 format exists on only a few CPUs, such as on the POWER9 + * arch. Because of this, this code has not been tested. I am not sure if the + * arch also supports uint128, and C does not seem to support int128 literals. + * So we use uint64 to do manipulation. Unfortunately this means we are endian + * dependent. Assume little-endian for now, can fix later once binary128 + * becomes more common. + */ +static npy_uint32 +Dragon4_PrintFloat_IEEE_binary128( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) +{ + FloatUnion128 buf128; + + char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + npy_uint32 floatExponent, floatSign; + + npy_uint64 mantissa_hi, mantissa_lo; + npy_int32 exponent; + npy_uint32 mantissaBit; + npy_bool hasUnequalMargins; + char signbit = '\0'; + + buf128.floatingPoint = *value; + + if (bufferSize == 0) { + return 0; + } + + if (bufferSize == 1) { + buffer[0] = '\0'; + return 0; + } + + /* Assumes little-endian !!! 
*/ + mantissa_hi = buf128.integer.a & bitmask_u64(48); + mantissa_lo = buf128.integer.b; + floatExponent = (buf128.integer.a >> 48) & bitmask_u32(15); + floatSign = buf128.integer.a >> 63; + + /* output the sign */ + if (floatSign != 0) { + signbit = '-'; + } + else if (opt->sign) { + signbit = '+'; + } + + /* if this is a special value */ + if (floatExponent == bitmask_u32(15)) { + npy_uint64 mantissa_zero = mantissa_hi == 0 && mantissa_lo == 0; + return PrintInfNan(buffer, bufferSize, !mantissa_zero, 16, signbit); + } + /* else this is a number */ + + /* factor the value into its parts */ + if (floatExponent != 0) { + /* + * normal + * The floating point equation is: + * value = (1 + mantissa/2^112) * 2 ^ (exponent-16383) + * We convert the integer equation by factoring a 2^112 out of the + * exponent + * value = (1 + mantissa/2^112) * 2^112 * 2 ^ (exponent-16383-112) + * value = (2^112 + mantissa) * 2 ^ (exponent-16383-112) + * Because of the implied 1 in front of the mantissa we have 112 bits of + * precision. + * m = (2^112 + mantissa) + * e = (exponent-16383+1-112) + * + * Adding 2^112 to the mantissa is the same as adding 2^48 to the hi + * 64 bit part. + */ + mantissa_hi = (1ull << 48) | mantissa_hi; + /* mantissa_lo is unchanged */ + exponent = floatExponent - 16383 - 112; + mantissaBit = 112; + hasUnequalMargins = (floatExponent != 1) && (mantissa_hi == 0 && + mantissa_lo == 0); } else { - return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit, - mantissaBit, hasUnequalMargins, digit_mode, - cutoff_mode, precision, trim_mode, - digits_left, digits_right); + /* + * subnormal + * The floating point equation is: + * value = (mantissa/2^112) * 2 ^ (1-16383) + * We convert the integer equation by factoring a 2^112 out of the + * exponent + * value = (mantissa/2^112) * 2^112 * 2 ^ (1-16383-112) + * value = mantissa * 2 ^ (1-16383-112) + * We have up to 112 bits of precision. 
+ * m = (mantissa) + * e = (1-16383-112) + */ + exponent = 1 - 16383 - 112; + mantissaBit = LogBase2_128(mantissa_hi, mantissa_lo); + hasUnequalMargins = NPY_FALSE; } + + BigInt_Set_2x_uint64(&bigints[0], mantissa_hi, mantissa_lo); + return Format_floatbits(buffer, bufferSize, bigints, exponent, + signbit, mantissaBit, hasUnequalMargins, opt); } +#endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */ -PyObject * -Dragon4_Positional_AnySize(void *val, size_t size, DigitMode digit_mode, - CutoffMode cutoff_mode, int precision, int sign, - TrimMode trim, int pad_left, int pad_right) +#if (defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) || \ + defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE)) +/* + * IBM extended precision 128-bit floating-point format, aka IBM double-dobule + * + * IBM's double-double type is a pair of IEEE binary64 values, which you add + * together to get a total value. The exponents are arranged so that the lower + * double is about 2^52 times smaller than the high one, and the nearest + * float64 value is simply the upper double, in which case the pair is + * considered "normalized" (not to confuse with "normal" and "subnormal" + * binary64 values). We assume normalized values. You can see the glibc's + * printf on ppc does so too by constructing un-normalized values to get + * strange behavior from the OS printf: + * + * >>> from numpy.core._multiarray_tests import format_float_OSprintf_g + * >>> x = np.array([0.3,0.3], dtype='f8').view('f16')[0] + * >>> format_float_OSprintf_g(x, 2) + * 0.30 + * >>> format_float_OSprintf_g(2*x, 2) + * 1.20 + * + * If we don't assume normalization, x should really print as 0.6. + * + * For normalized values gcc assumes that the total mantissa is no + * more than 106 bits (53+53), so we can drop bits from the second double which + * would be pushed past 106 when left-shifting by its exponent, as happens + * sometimes. 
(There has been debate about this, see + * https://gcc.gnu.org/bugzilla/show_bug.cgi?format=multiple&id=70117, + * https://sourceware.org/bugzilla/show_bug.cgi?id=22752 ) + * + * Note: This function is for the IBM-double-double which is a pair of IEEE + * binary64 floats, like on ppc64 systems. This is *not* the hexadecimal + * IBM-double-double type, which is a pair of IBM hexadecimal64 floats. + * + * See also: + * https://gcc.gnu.org/wiki/Ieee128PowerPCA + * https://www.ibm.com/support/knowledgecenter/en/ssw_aix_71/com.ibm.aix.genprogc/128bit_long_double_floating-point_datatype.htm + */ +static npy_uint32 +Dragon4_PrintFloat_IBM_double_double( + Dragon4_Scratch *scratch, FloatVal128 val128, Dragon4_Options *opt) +{ + char *buffer = scratch->repr; + npy_uint32 bufferSize = sizeof(scratch->repr); + BigInt *bigints = scratch->bigints; + + npy_uint32 floatExponent1, floatExponent2; + npy_uint64 floatMantissa1, floatMantissa2; + npy_uint32 floatSign1, floatSign2; + + npy_uint64 mantissa1, mantissa2; + npy_int32 exponent1, exponent2; + int shift; + npy_uint32 mantissaBit; + npy_bool hasUnequalMargins; + char signbit = '\0'; + + if (bufferSize == 0) { + return 0; + } + + if (bufferSize == 1) { + buffer[0] = '\0'; + return 0; + } + + /* deconstruct the floating point values */ + floatMantissa1 = val128.hi & bitmask_u64(52); + floatExponent1 = (val128.hi >> 52) & bitmask_u32(11); + floatSign1 = (val128.hi >> 63) != 0; + + floatMantissa2 = val128.lo & bitmask_u64(52); + floatExponent2 = (val128.lo >> 52) & bitmask_u32(11); + floatSign2 = (val128.lo >> 63) != 0; + + /* output the sign using 1st float's sign */ + if (floatSign1) { + signbit = '-'; + } + else if (opt->sign) { + signbit = '+'; + } + + /* we only need to look at the first float for inf/nan */ + if (floatExponent1 == bitmask_u32(11)) { + return PrintInfNan(buffer, bufferSize, floatMantissa1, 13, signbit); + } + + /* else this is a number */ + + /* Factor the 1st value into its parts, see binary64 for comments. 
*/ + if (floatExponent1 == 0) { + /* + * If the first number is a subnormal value, the 2nd has to be 0 for + * the float128 to be normalized, so we can ignore it. In this case + * the float128 only has the precision of a single binary64 value. + */ + mantissa1 = floatMantissa1; + exponent1 = 1 - 1023 - 52; + mantissaBit = LogBase2_64(mantissa1); + hasUnequalMargins = NPY_FALSE; + + BigInt_Set_uint64(&bigints[0], mantissa1); + } + else { + mantissa1 = (1ull << 52) | floatMantissa1; + exponent1 = floatExponent1 - 1023 - 52; + mantissaBit = 52 + 53; + + /* + * Computing hasUnequalMargins and mantissaBit: + * This is a little trickier than for IEEE formats. + * + * When both doubles are "normal" it is clearer since we can think of + * it as an IEEE type with a 106 bit mantissa. This value can never + * have "unequal" margins because of the implied 1 bit in the 2nd + * value. (unequal margins only happen when the mantissa has a value + * like "10000000000...", all zeros except the implied bit at the + * start, since the next lowest number has a different exponent). + * mantissaBits will always be 52+53 in this case. + * + * If the 1st number is a very small normal, and the 2nd is subnormal + * and not 2^52 times smaller, the number behaves like a subnormal + * overall, where the upper number just adds some bits on the left. + * Like usual subnormals, it has "equal" margins. The slightly tricky + * thing is that the number of mantissaBits varies. It will be 52 + * (from lower double) plus a variable number depending on the upper + * number's exponent. We recompute the number of bits in the shift + * calculation below, because the shift will be equal to the number of + * lost bits. + * + * We can get unequal margins only if the first value has all-0 + * mantissa (except implied bit), and the second value is exactly 0. 
As + * a special exception the smallest normal value (smallest exponent, 0 + * mantissa) should have equal margins, since it is "next to" a + * subnormal value. + */ + + /* factor the 2nd value into its parts */ + if (floatExponent2 != 0) { + mantissa2 = (1ull << 52) | floatMantissa2; + exponent2 = floatExponent2 - 1023 - 52; + hasUnequalMargins = NPY_FALSE; + } + else { + /* shift exp by one so that leading mantissa bit is still bit 53 */ + mantissa2 = floatMantissa2 << 1; + exponent2 = - 1023 - 52; + hasUnequalMargins = (floatExponent1 != 1) && (floatMantissa1 == 0) + && (floatMantissa2 == 0); + } + + /* + * The 2nd val's exponent might not be exactly 52 smaller than the 1st, + * it can vary a little bit. So do some shifting of the low mantissa, + * so that the total mantissa is equivalent to bits 53 to 0 of the + * first double immediately followed by bits 53 to 0 of the second. + */ + shift = exponent1 - exponent2 - 53; + if (shift > 0) { + /* shift more than 64 is undefined behavior */ + mantissa2 = shift < 64 ? mantissa2 >> shift : 0; + } + else if (shift < 0) { + /* + * This only happens if the 2nd value is subnormal. + * We expect that shift > -64, but check it anyway + */ + mantissa2 = -shift < 64 ? mantissa2 << -shift : 0; + } + + /* + * If the low double is a different sign from the high double, + * rearrange so that the total mantissa is the sum of the two + * mantissas, instead of a subtraction. + * hi - lo -> (hi-1) + (1-lo), where lo < 1 + */ + if (floatSign1 != floatSign2 && mantissa2 != 0) { + mantissa1--; + mantissa2 = (1ull << 53) - mantissa2; + } + + /* + * Compute the number of bits if we are in the subnormal range. + * The value "shift" happens to be exactly the number of lost bits. + * Also, shift the bits so that the least significant bit is at + * bit position 0, like a typical subnormal. 
After this exponent1 + * should always be 2^-1022 + */ + if (shift < 0) { + mantissa2 = (mantissa2 >> -shift) | (mantissa1 << (53 + shift)); + mantissa1 = mantissa1 >> -shift; + mantissaBit = mantissaBit -(-shift); + exponent1 -= shift; + DEBUG_ASSERT(exponent1 == -1022); + } + + /* + * set up the BigInt mantissa, by shifting the parts as needed + * We can use | instead of + since the mantissas should not overlap + */ + BigInt_Set_2x_uint64(&bigints[0], mantissa1 >> 11, + (mantissa1 << 53) | (mantissa2)); + exponent1 = exponent1 - 53; + } + + return Format_floatbits(buffer, bufferSize, bigints, exponent1, + signbit, mantissaBit, hasUnequalMargins, opt); +} + +#if defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) +static npy_uint32 +Dragon4_PrintFloat_IBM_double_double_le( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) { - /* - * Use a very large buffer in case anyone tries to output a large numberG. - * 16384 should be enough to uniquely print any float128, which goes up - * to about 10^4932 */ - static char repr[16384]; FloatVal128 val128; -#ifdef NPY_FLOAT80 - FloatUnion80 buf80;; -#endif -#ifdef NPY_FLOAT96 - FloatUnion96 buf96; -#endif -#ifdef NPY_FLOAT128 FloatUnion128 buf128; -#endif - switch (size) { - case 2: - Dragon4_PrintFloat16(repr, sizeof(repr), *(npy_float16*)val, - 0, digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right, -1); - break; - case 4: - Dragon4_PrintFloat32(repr, sizeof(repr), *(npy_float32*)val, - 0, digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right, -1); - break; - case 8: - Dragon4_PrintFloat64(repr, sizeof(repr), *(npy_float64*)val, - 0, digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right, -1); - break; -#ifdef NPY_FLOAT80 - case 10: - buf80.floatingPoint = *(npy_float80*)val; - val128.integer[0] = buf80.integer.a; - val128.integer[1] = buf80.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 0, digit_mode, cutoff_mode, precision, - sign, trim, 
pad_left, pad_right, -1); - break; -#endif -#ifdef NPY_FLOAT96 - case 12: - buf96.floatingPoint = *(npy_float96*)val; - val128.integer[0] = buf96.integer.a; - val128.integer[1] = buf96.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 0, digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right, -1); - break; -#endif -#ifdef NPY_FLOAT128 - case 16: - buf128.floatingPoint = *(npy_float128*)val; - val128.integer[0] = buf128.integer.a; - val128.integer[1] = buf128.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 0, digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right, -1); - break; -#endif - default: - PyErr_Format(PyExc_ValueError, "unexpected itemsize %zu", size); - return NULL; - } + buf128.floatingPoint = *value; + val128.lo = buf128.integer.a; + val128.hi = buf128.integer.b; - return PyUString_FromString(repr); + return Dragon4_PrintFloat_IBM_double_double(scratch, val128, opt); } +#endif /* HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE */ + +#if defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE) +static npy_uint32 +Dragon4_PrintFloat_IBM_double_double_be( + Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) +{ + FloatVal128 val128; + FloatUnion128 buf128; + + buf128.floatingPoint = *value; + val128.hi = buf128.integer.a; + val128.lo = buf128.integer.b; + + return Dragon4_PrintFloat_IBM_double_double(scratch, val128, opt); +} + +#endif /* HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE */ + +#endif /* HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE | HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE */ + +#endif /* NPY_FLOAT128 */ + + +/* + * Here we define two Dragon4 entry functions for each type. One of them + * accepts the args in a Dragon4_Options struct for convenience, the + * other enumerates only the necessary parameters. + * + * Use a very large string buffer in case anyone tries to output a large number. + * 16384 should be enough to exactly print the integer part of any float128, + * which goes up to about 10^4932. 
The Dragon4_scratch struct provides a string + * buffer of this size. + */ +#define make_dragon4_typefuncs_inner(Type, npy_type, format) \ +\ +PyObject *\ +Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\ +{\ + PyObject *ret;\ + Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\ + if (scratch == NULL) {\ + return NULL;\ + }\ + if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\ + free_dragon4_bigint_scratch(scratch);\ + return NULL;\ + }\ + ret = PyUString_FromString(scratch->repr);\ + free_dragon4_bigint_scratch(scratch);\ + return ret;\ +}\ +\ +PyObject *\ +Dragon4_Positional_##Type(npy_type *val, DigitMode digit_mode,\ + CutoffMode cutoff_mode, int precision,\ + int sign, TrimMode trim, int pad_left, int pad_right)\ +{\ + Dragon4_Options opt;\ + \ + opt.scientific = 0;\ + opt.digit_mode = digit_mode;\ + opt.cutoff_mode = cutoff_mode;\ + opt.precision = precision;\ + opt.sign = sign;\ + opt.trim_mode = trim;\ + opt.digits_left = pad_left;\ + opt.digits_right = pad_right;\ + opt.exp_digits = -1;\ +\ + return Dragon4_Positional_##Type##_opt(val, &opt);\ +}\ +\ +PyObject *\ +Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\ +{\ + PyObject *ret;\ + Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\ + if (scratch == NULL) {\ + return NULL;\ + }\ + if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\ + free_dragon4_bigint_scratch(scratch);\ + return NULL;\ + }\ + ret = PyUString_FromString(scratch->repr);\ + free_dragon4_bigint_scratch(scratch);\ + return ret;\ +}\ +PyObject *\ +Dragon4_Scientific_##Type(npy_type *val, DigitMode digit_mode, int precision,\ + int sign, TrimMode trim, int pad_left, int exp_digits)\ +{\ + Dragon4_Options opt;\ +\ + opt.scientific = 1;\ + opt.digit_mode = digit_mode;\ + opt.cutoff_mode = CutoffMode_TotalLength;\ + opt.precision = precision;\ + opt.sign = sign;\ + opt.trim_mode = trim;\ + opt.digits_left = pad_left;\ + opt.digits_right = -1;\ + opt.exp_digits = exp_digits;\ 
+\ + return Dragon4_Scientific_##Type##_opt(val, &opt);\ +} + +#define make_dragon4_typefuncs(Type, npy_type, format) \ + make_dragon4_typefuncs_inner(Type, npy_type, format) + +make_dragon4_typefuncs(Half, npy_half, NPY_HALF_BINFMT_NAME) +make_dragon4_typefuncs(Float, npy_float, NPY_FLOAT_BINFMT_NAME) +make_dragon4_typefuncs(Double, npy_double, NPY_DOUBLE_BINFMT_NAME) +make_dragon4_typefuncs(LongDouble, npy_longdouble, NPY_LONGDOUBLE_BINFMT_NAME) + +#undef make_dragon4_typefuncs +#undef make_dragon4_typefuncs_inner PyObject * Dragon4_Positional(PyObject *obj, DigitMode digit_mode, CutoffMode cutoff_mode, int precision, int sign, TrimMode trim, int pad_left, int pad_right) { - double val; + npy_double val; + Dragon4_Options opt; + + opt.scientific = 0; + opt.digit_mode = digit_mode; + opt.cutoff_mode = cutoff_mode; + opt.precision = precision; + opt.sign = sign; + opt.trim_mode = trim; + opt.digits_left = pad_left; + opt.digits_right = pad_right; + opt.exp_digits = -1; if (PyArray_IsScalar(obj, Half)) { npy_half x = ((PyHalfScalarObject *)obj)->obval; - return Dragon4_Positional_AnySize(&x, sizeof(npy_half), - digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right); + return Dragon4_Positional_Half_opt(&x, &opt); } else if (PyArray_IsScalar(obj, Float)) { npy_float x = ((PyFloatScalarObject *)obj)->obval; - return Dragon4_Positional_AnySize(&x, sizeof(npy_float), - digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right); + return Dragon4_Positional_Float_opt(&x, &opt); } else if (PyArray_IsScalar(obj, Double)) { npy_double x = ((PyDoubleScalarObject *)obj)->obval; - return Dragon4_Positional_AnySize(&x, sizeof(npy_double), - digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right); + return Dragon4_Positional_Double_opt(&x, &opt); } else if (PyArray_IsScalar(obj, LongDouble)) { npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval; - return Dragon4_Positional_AnySize(&x, sizeof(npy_longdouble), - digit_mode, 
cutoff_mode, precision, - sign, trim, pad_left, pad_right); + return Dragon4_Positional_LongDouble_opt(&x, &opt); } val = PyFloat_AsDouble(obj); if (PyErr_Occurred()) { return NULL; } - return Dragon4_Positional_AnySize(&val, sizeof(double), - digit_mode, cutoff_mode, precision, - sign, trim, pad_left, pad_right); -} - -PyObject * -Dragon4_Scientific_AnySize(void *val, size_t size, DigitMode digit_mode, - int precision, int sign, TrimMode trim, - int pad_left, int exp_digits) -{ - /* use a very large buffer in case anyone tries to output a large precision */ - static char repr[4096]; - FloatVal128 val128; -#ifdef NPY_FLOAT80 - FloatUnion80 buf80;; -#endif -#ifdef NPY_FLOAT96 - FloatUnion96 buf96; -#endif -#ifdef NPY_FLOAT128 - FloatUnion128 buf128; -#endif - - /* dummy, is ignored in scientific mode */ - CutoffMode cutoff_mode = CutoffMode_TotalLength; - - switch (size) { - case 2: - Dragon4_PrintFloat16(repr, sizeof(repr), *(npy_float16*)val, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; - case 4: - Dragon4_PrintFloat32(repr, sizeof(repr), *(npy_float32*)val, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; - case 8: - Dragon4_PrintFloat64(repr, sizeof(repr), *(npy_float64*)val, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; -#ifdef NPY_FLOAT80 - case 10: - buf80.floatingPoint = *(npy_float80*)val; - val128.integer[0] = buf80.integer.a; - val128.integer[1] = buf80.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; -#endif -#ifdef NPY_FLOAT96 - case 12: - buf96.floatingPoint = *(npy_float96*)val; - val128.integer[0] = buf96.integer.a; - val128.integer[1] = buf96.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; -#endif -#ifdef 
NPY_FLOAT128 - case 16: - buf128.floatingPoint = *(npy_float128*)val; - val128.integer[0] = buf128.integer.a; - val128.integer[1] = buf128.integer.b; - Dragon4_PrintFloat128(repr, sizeof(repr), val128, - 1, digit_mode, cutoff_mode, precision, sign, - trim, pad_left, -1, exp_digits); - break; -#endif - default: - PyErr_Format(PyExc_ValueError, "unexpected itemsize %zu", size); - return NULL; - } - - return PyUString_FromString(repr); + return Dragon4_Positional_Double_opt(&val, &opt); } PyObject * Dragon4_Scientific(PyObject *obj, DigitMode digit_mode, int precision, int sign, TrimMode trim, int pad_left, int exp_digits) { - double val; + npy_double val; + Dragon4_Options opt; + + opt.scientific = 1; + opt.digit_mode = digit_mode; + opt.cutoff_mode = CutoffMode_TotalLength; + opt.precision = precision; + opt.sign = sign; + opt.trim_mode = trim; + opt.digits_left = pad_left; + opt.digits_right = -1; + opt.exp_digits = exp_digits; if (PyArray_IsScalar(obj, Half)) { npy_half x = ((PyHalfScalarObject *)obj)->obval; - return Dragon4_Scientific_AnySize(&x, sizeof(npy_half), - digit_mode, precision, - sign, trim, pad_left, exp_digits); + return Dragon4_Scientific_Half_opt(&x, &opt); } else if (PyArray_IsScalar(obj, Float)) { npy_float x = ((PyFloatScalarObject *)obj)->obval; - return Dragon4_Scientific_AnySize(&x, sizeof(npy_float), - digit_mode, precision, - sign, trim, pad_left, exp_digits); + return Dragon4_Scientific_Float_opt(&x, &opt); } else if (PyArray_IsScalar(obj, Double)) { npy_double x = ((PyDoubleScalarObject *)obj)->obval; - return Dragon4_Scientific_AnySize(&x, sizeof(npy_double), - digit_mode, precision, - sign, trim, pad_left, exp_digits); + return Dragon4_Scientific_Double_opt(&x, &opt); } else if (PyArray_IsScalar(obj, LongDouble)) { npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval; - return Dragon4_Scientific_AnySize(&x, sizeof(npy_longdouble), - digit_mode, precision, - sign, trim, pad_left, exp_digits); + return 
Dragon4_Scientific_LongDouble_opt(&x, &opt); } val = PyFloat_AsDouble(obj); if (PyErr_Occurred()) { return NULL; } - return Dragon4_Scientific_AnySize(&val, sizeof(double), - digit_mode, precision, - sign, trim, pad_left, exp_digits); + return Dragon4_Scientific_Double_opt(&val, &opt); } + +#undef DEBUG_ASSERT diff --git a/numpy/core/src/multiarray/dragon4.h b/numpy/core/src/multiarray/dragon4.h index 5559c5157..383a0949d 100644 --- a/numpy/core/src/multiarray/dragon4.h +++ b/numpy/core/src/multiarray/dragon4.h @@ -40,6 +40,49 @@ #include "npy_pycompat.h" #include "numpy/arrayscalars.h" +/* Half binary format */ +#define NPY_HALF_BINFMT_NAME IEEE_binary16 + +/* Float binary format */ +#if NPY_BITSOF_FLOAT == 32 + #define NPY_FLOAT_BINFMT_NAME IEEE_binary32 +#elif NPY_BITSOF_FLOAT == 64 + #define NPY_FLOAT_BINFMT_NAME IEEE_binary64 +#else + #error No float representation defined +#endif + +/* Double binary format */ +#if NPY_BITSOF_DOUBLE == 32 + #define NPY_DOUBLE_BINFMT_NAME IEEE_binary32 +#elif NPY_BITSOF_DOUBLE == 64 + #define NPY_DOUBLE_BINFMT_NAME IEEE_binary64 +#else + #error No double representation defined +#endif + +/* LongDouble binary format */ +#if defined(HAVE_LDOUBLE_IEEE_QUAD_BE) + #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary128_be +#elif defined(HAVE_LDOUBLE_IEEE_QUAD_LE) + #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary128_le +#elif (defined(HAVE_LDOUBLE_IEEE_DOUBLE_LE) || \ + defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE)) + #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary64 +#elif defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE) + #define NPY_LONGDOUBLE_BINFMT_NAME Intel_extended96 +#elif defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE) + #define NPY_LONGDOUBLE_BINFMT_NAME Intel_extended128 +#elif defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE) + #define NPY_LONGDOUBLE_BINFMT_NAME Motorola_extended96 +#elif defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) + #define NPY_LONGDOUBLE_BINFMT_NAME IBM_double_double_le +#elif 
defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE) + #define NPY_LONGDOUBLE_BINFMT_NAME IBM_double_double_be +#else + #error No long double representation defined +#endif + typedef enum DigitMode { /* Round digits to print shortest uniquely identifiable number. */ @@ -64,15 +107,23 @@ typedef enum TrimMode TrimMode_DptZeros, /* trim trailing zeros & trailing decimal point */ } TrimMode; -PyObject * -Dragon4_Positional_AnySize(void *val, size_t size, DigitMode digit_mode, - CutoffMode cutoff_mode, int precision, int sign, - TrimMode trim, int pad_left, int pad_right); +#define make_dragon4_typedecl(Type, npy_type) \ + PyObject *\ + Dragon4_Positional_##Type(npy_type *val, DigitMode digit_mode,\ + CutoffMode cutoff_mode, int precision,\ + int sign, TrimMode trim, int pad_left,\ + int pad_right);\ + PyObject *\ + Dragon4_Scientific_##Type(npy_type *val, DigitMode digit_mode,\ + int precision, int sign, TrimMode trim,\ + int pad_left, int exp_digits); -PyObject * -Dragon4_Scientific_AnySize(void *val, size_t size, DigitMode digit_mode, - int precision, int sign, TrimMode trim, - int pad_left, int pad_right); +make_dragon4_typedecl(Half, npy_half) +make_dragon4_typedecl(Float, npy_float) +make_dragon4_typedecl(Double, npy_double) +make_dragon4_typedecl(LongDouble, npy_longdouble) + +#undef make_dragon4_typedecl PyObject * Dragon4_Positional(PyObject *obj, DigitMode digit_mode, CutoffMode cutoff_mode, diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index 9c27255aa..9f9aa6757 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -3400,6 +3400,7 @@ PyArray_GetDTypeTransferFunction(int aligned, { npy_intp src_itemsize, dst_itemsize; int src_type_num, dst_type_num; + int is_builtin; #if NPY_DT_DBG_TRACING printf("Calculating dtype transfer from "); @@ -3439,6 +3440,7 @@ PyArray_GetDTypeTransferFunction(int aligned, dst_itemsize = dst_dtype->elsize; src_type_num = 
src_dtype->type_num; dst_type_num = dst_dtype->type_num; + is_builtin = src_type_num < NPY_NTYPES && dst_type_num < NPY_NTYPES; /* Common special case - number -> number NBO cast */ if (PyTypeNum_ISNUMBER(src_type_num) && @@ -3462,13 +3464,14 @@ PyArray_GetDTypeTransferFunction(int aligned, } /* - * If there are no references and the data types are equivalent, + * If there are no references and the data types are equivalent and builtin, * return a simple copy */ if (PyArray_EquivTypes(src_dtype, dst_dtype) && !PyDataType_REFCHK(src_dtype) && !PyDataType_REFCHK(dst_dtype) && ( !PyDataType_HASFIELDS(dst_dtype) || - is_dtype_struct_simple_unaligned_layout(dst_dtype)) ) { + is_dtype_struct_simple_unaligned_layout(dst_dtype)) && + is_builtin) { /* * We can't pass through the aligned flag because it's not * appropriate. Consider a size-8 string, it will say it's @@ -3494,7 +3497,7 @@ PyArray_GetDTypeTransferFunction(int aligned, !PyDataType_HASSUBARRAY(dst_dtype) && src_type_num != NPY_DATETIME && src_type_num != NPY_TIMEDELTA) { /* A custom data type requires that we use its copy/swap */ - if (src_type_num >= NPY_NTYPES || dst_type_num >= NPY_NTYPES) { + if (!is_builtin) { /* * If the sizes and kinds are identical, but they're different * custom types, then get a cast function diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 5dbc30aa9..3c086351f 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -591,7 +591,7 @@ finish_after_unrolled_loop: accum += @from@(data0[@i@]) * @from@(data1[@i@]); /**end repeat2**/ case 0: - *(@type@ *)dataptr[2] += @to@(accum); + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); return; } @@ -749,7 +749,7 @@ finish_after_unrolled_loop: accum += @from@(data1[@i@]); /**end repeat2**/ case 0: - *(@type@ *)dataptr[2] += @to@(value0 * accum); + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); return; } @@ 
-848,7 +848,7 @@ finish_after_unrolled_loop: accum += @from@(data0[@i@]); /**end repeat2**/ case 0: - *(@type@ *)dataptr[2] += @to@(accum * value1); + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); return; } @@ -1776,138 +1776,94 @@ get_sum_of_products_function(int nop, int type_num, return _unspecialized_table[type_num][nop <= 3 ? nop : 0]; } + /* - * Parses the subscripts for one operand into an output - * of 'ndim' labels + * Parses the subscripts for one operand into an output of 'ndim' + * labels. The resulting 'op_labels' array will have: + * - the ASCII code of the label for the first occurrence of a label; + * - the (negative) offset to the first occurrence of the label for + * repeated labels; + * - zero for broadcast dimensions, if subscripts has an ellipsis. + * For example: + * - subscripts="abbcbc", ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2] + * - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99] */ + static int parse_operand_subscripts(char *subscripts, int length, - int ndim, - int iop, char *out_labels, - char *out_label_counts, - int *out_min_label, - int *out_max_label, - int *out_num_labels) + int ndim, int iop, char *op_labels, + char *label_counts, int *min_label, int *max_label) { - int i, idim, ndim_left, label; - int ellipsis = 0; + int i; + int idim = 0; + int ellipsis = -1; - /* Process the labels from the end until the ellipsis */ - idim = ndim-1; - for (i = length-1; i >= 0; --i) { - label = subscripts[i]; - /* A label for an axis */ + /* Process all labels for this operand */ + for (i = 0; i < length; ++i) { + int label = subscripts[i]; + + /* A proper label for an axis. 
*/ if (label > 0 && isalpha(label)) { - if (idim >= 0) { - out_labels[idim--] = label; - /* Calculate the min and max labels */ - if (label < *out_min_label) { - *out_min_label = label; - } - if (label > *out_max_label) { - *out_max_label = label; - } - /* If it's the first time we see this label, count it */ - if (out_label_counts[label] == 0) { - (*out_num_labels)++; - } - out_label_counts[label]++; - } - else { + /* Check we don't exceed the operator dimensions. */ + if (idim >= ndim) { PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains " - "too many subscripts for operand %d", iop); - return 0; + "einstein sum subscripts string contains " + "too many subscripts for operand %d", iop); + return -1; + } + + op_labels[idim++] = label; + if (label < *min_label) { + *min_label = label; + } + if (label > *max_label) { + *max_label = label; } + label_counts[label]++; } - /* The end of the ellipsis */ + /* The beginning of the ellipsis. */ else if (label == '.') { - /* A valid ellipsis */ - if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') { - ellipsis = 1; - length = i-2; - break; - } - else { + /* Check it's a proper ellipsis. */ + if (ellipsis != -1 || i + 2 >= length + || subscripts[++i] != '.' || subscripts[++i] != '.') { PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains a " - "'.' that is not part of an ellipsis ('...') in " - "operand %d", iop); - return 0; - + "einstein sum subscripts string contains a " + "'.' 
that is not part of an ellipsis ('...') " + "in operand %d", iop); + return -1; } + + ellipsis = idim; } else if (label != ' ') { PyErr_Format(PyExc_ValueError, - "invalid subscript '%c' in einstein sum " - "subscripts string, subscripts must " - "be letters", (char)label); - return 0; + "invalid subscript '%c' in einstein sum " + "subscripts string, subscripts must " + "be letters", (char)label); + return -1; } } - if (!ellipsis && idim != -1) { - PyErr_Format(PyExc_ValueError, - "operand has more dimensions than subscripts " - "given in einstein sum, but no '...' ellipsis " - "provided to broadcast the extra dimensions."); - return 0; - } - - /* Reduce ndim to just the dimensions left to fill at the beginning */ - ndim_left = idim+1; - idim = 0; - - /* - * If we stopped because of an ellipsis, start again from the beginning. - * The length was truncated to end at the ellipsis in this case. - */ - if (i > 0) { - for (i = 0; i < length; ++i) { - label = subscripts[i]; - /* A label for an axis */ - if (label > 0 && isalnum(label)) { - if (idim < ndim_left) { - out_labels[idim++] = label; - /* Calculate the min and max labels */ - if (label < *out_min_label) { - *out_min_label = label; - } - if (label > *out_max_label) { - *out_max_label = label; - } - /* If it's the first time we see this label, count it */ - if (out_label_counts[label] == 0) { - (*out_num_labels)++; - } - out_label_counts[label]++; - } - else { - PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains " - "too many subscripts for operand %d", iop); - return 0; - } - } - else if (label == '.') { - PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains a " - "'.' 
that is not part of an ellipsis ('...') in " - "operand %d", iop); - } - else if (label != ' ') { - PyErr_Format(PyExc_ValueError, - "invalid subscript '%c' in einstein sum " - "subscripts string, subscripts must " - "be letters", (char)label); - return 0; - } + /* No ellipsis found, labels must match dimensions exactly. */ + if (ellipsis == -1) { + if (idim != ndim) { + PyErr_Format(PyExc_ValueError, + "operand has more dimensions than subscripts " + "given in einstein sum, but no '...' ellipsis " + "provided to broadcast the extra dimensions."); + return -1; } } - - /* Set the remaining labels to 0 */ - while (idim < ndim_left) { - out_labels[idim++] = 0; + /* Ellipsis found, may have to add broadcast dimensions. */ + else if (idim < ndim) { + /* Move labels after ellipsis to the end. */ + for (i = 0; i < idim - ellipsis; ++i) { + op_labels[ndim - i - 1] = op_labels[idim - i - 1]; + } + /* Set all broadcast dimensions to zero. */ + for (i = 0; i < ndim - idim; ++i) { + op_labels[ellipsis + i] = 0; + } } /* @@ -1918,158 +1874,116 @@ parse_operand_subscripts(char *subscripts, int length, * twos complement arithmetic the char is ok either way here, and * later where it matters the char is cast to a signed char. */ - for (idim = 0; idim < ndim-1; ++idim) { - char *next; - /* If this is a proper label, find any duplicates of it */ - label = out_labels[idim]; + for (idim = 0; idim < ndim - 1; ++idim) { + int label = op_labels[idim]; + /* If it is a proper label, find any duplicates of it. */ if (label > 0) { - /* Search for the next matching label */ - next = (char *)memchr(out_labels+idim+1, label, - ndim-idim-1); + /* Search for the next matching label. 
*/ + char *next = memchr(op_labels + idim + 1, label, ndim - idim - 1); + while (next != NULL) { - /* The offset from next to out_labels[idim] (negative) */ - *next = (char)((out_labels+idim)-next); - /* Search for the next matching label */ - next = (char *)memchr(next+1, label, - out_labels+ndim-1-next); + /* The offset from next to op_labels[idim] (negative). */ + *next = (char)((op_labels + idim) - next); + /* Search for the next matching label. */ + next = memchr(next + 1, label, op_labels + ndim - 1 - next); } } } - return 1; + return 0; } + /* - * Parses the subscripts for the output operand into an output - * that requires 'ndim_broadcast' unlabeled dimensions, returning - * the number of output dimensions. Returns -1 if there is an error. + * Parses the subscripts for the output operand into an output that + * includes 'ndim_broadcast' unlabeled dimensions, and returns the total + * number of output dimensions, or -1 if there is an error. Similarly + * to parse_operand_subscripts, the 'out_labels' array will have, for + * each dimension: + * - the ASCII code of the corresponding label; + * - zero for broadcast dimensions, if subscripts has an ellipsis. */ static int parse_output_subscripts(char *subscripts, int length, int ndim_broadcast, - const char *label_counts, - char *out_labels) + const char *label_counts, char *out_labels) { - int i, nlabels, label, idim, ndim, ndim_left; + int i, bdim; + int ndim = 0; int ellipsis = 0; - /* Count the labels, making sure they're all unique and valid */ - nlabels = 0; + /* Process all the output labels. 
*/ for (i = 0; i < length; ++i) { - label = subscripts[i]; - if (label > 0 && isalpha(label)) { - /* Check if it occurs again */ - if (memchr(subscripts+i+1, label, length-i-1) == NULL) { - /* Check that it was used in the inputs */ - if (label_counts[label] == 0) { - PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string included " - "output subscript '%c' which never appeared " - "in an input", (char)label); - return -1; - } + int label = subscripts[i]; - nlabels++; - } - else { + /* A proper label for an axis. */ + if (label > 0 && isalpha(label)) { + /* Check that it doesn't occur again. */ + if (memchr(subscripts + i + 1, label, length - i - 1) != NULL) { PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string includes " - "output subscript '%c' multiple times", - (char)label); + "einstein sum subscripts string includes " + "output subscript '%c' multiple times", + (char)label); return -1; } - } - else if (label != '.' && label != ' ') { - PyErr_Format(PyExc_ValueError, - "invalid subscript '%c' in einstein sum " - "subscripts string, subscripts must " - "be letters", (char)label); - return -1; - } - } - - /* The number of output dimensions */ - ndim = ndim_broadcast + nlabels; - - /* Process the labels from the end until the ellipsis */ - idim = ndim-1; - for (i = length-1; i >= 0; --i) { - label = subscripts[i]; - /* A label for an axis */ - if (label != '.' && label != ' ') { - if (idim >= 0) { - out_labels[idim--] = label; + /* Check that it was used in the inputs. */ + if (label_counts[label] == 0) { + PyErr_Format(PyExc_ValueError, + "einstein sum subscripts string included " + "output subscript '%c' which never appeared " + "in an input", (char)label); + return -1; } - else { + /* Check that there is room in out_labels for this label. 
*/ + if (ndim >= NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains " - "too many output subscripts"); + "einstein sum subscripts string contains " + "too many subscripts in the output"); return -1; } + + out_labels[ndim++] = label; } - /* The end of the ellipsis */ + /* The beginning of the ellipsis. */ else if (label == '.') { - /* A valid ellipsis */ - if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') { - ellipsis = 1; - length = i-2; - break; - } - else { + /* Check it is a proper ellipsis. */ + if (ellipsis || i + 2 >= length + || subscripts[++i] != '.' || subscripts[++i] != '.') { PyErr_SetString(PyExc_ValueError, - "einstein sum subscripts string contains a " - "'.' that is not part of an ellipsis ('...') " - "in the output"); + "einstein sum subscripts string " + "contains a '.' that is not part of " + "an ellipsis ('...') in the output"); return -1; - } - } - } - - if (!ellipsis && idim != -1) { - PyErr_SetString(PyExc_ValueError, - "output has more dimensions than subscripts " - "given in einstein sum, but no '...' ellipsis " - "provided to broadcast the extra dimensions."); - return 0; - } - - /* Reduce ndim to just the dimensions left to fill at the beginning */ - ndim_left = idim+1; - idim = 0; - - /* - * If we stopped because of an ellipsis, start again from the beginning. - * The length was truncated to end at the ellipsis in this case. - */ - if (i > 0) { - for (i = 0; i < length; ++i) { - label = subscripts[i]; - if (label == '.') { - PyErr_SetString(PyExc_ValueError, - "einstein sum subscripts string contains a " - "'.' that is not part of an ellipsis ('...') " - "in the output"); + /* Check there is room in out_labels for broadcast dims. 
*/ + if (ndim + ndim_broadcast > NPY_MAXDIMS) { + PyErr_Format(PyExc_ValueError, + "einstein sum subscripts string contains " + "too many subscripts in the output"); return -1; } - /* A label for an axis */ - else if (label != ' ') { - if (idim < ndim_left) { - out_labels[idim++] = label; - } - else { - PyErr_Format(PyExc_ValueError, - "einstein sum subscripts string contains " - "too many subscripts for the output"); - return -1; - } + + ellipsis = 1; + for (bdim = 0; bdim < ndim_broadcast; ++bdim) { + out_labels[ndim++] = 0; } } + else if (label != ' ') { + PyErr_Format(PyExc_ValueError, + "invalid subscript '%c' in einstein sum " + "subscripts string, subscripts must " + "be letters", (char)label); + return -1; + } } - /* Set the remaining output labels to 0 */ - while (idim < ndim_left) { - out_labels[idim++] = 0; + /* If no ellipsis was found there should be no broadcast dimensions. */ + if (!ellipsis && ndim_broadcast > 0) { + PyErr_SetString(PyExc_ValueError, + "output has more dimensions than subscripts " + "given in einstein sum, but no '...' 
ellipsis " + "provided to broadcast the extra dimensions."); + return -1; } return ndim; @@ -2121,7 +2035,7 @@ get_single_op_view(PyArrayObject *op, int iop, char *labels, if (ibroadcast == ndim_output) { PyErr_SetString(PyExc_ValueError, "output had too few broadcast dimensions"); - return 0; + return -1; } new_dims[ibroadcast] = PyArray_DIM(op, idim); new_strides[ibroadcast] = PyArray_STRIDE(op, idim); @@ -2144,7 +2058,7 @@ get_single_op_view(PyArrayObject *op, int iop, char *labels, "index '%c' don't match (%d != %d)", iop, label, (int)new_dims[i], (int)PyArray_DIM(op, idim)); - return 0; + return -1; } new_dims[i] = PyArray_DIM(op, idim); new_strides[i] += PyArray_STRIDE(op, idim); @@ -2162,31 +2076,20 @@ get_single_op_view(PyArrayObject *op, int iop, char *labels, (PyObject *)op); if (*ret == NULL) { - return 0; + return -1; } - if (!PyArray_Check(*ret)) { - Py_DECREF(*ret); - *ret = NULL; - PyErr_SetString(PyExc_RuntimeError, - "NewFromDescr failed to return an array"); - return 0; - } - PyArray_UpdateFlags(*ret, - NPY_ARRAY_C_CONTIGUOUS| - NPY_ARRAY_ALIGNED| - NPY_ARRAY_F_CONTIGUOUS); Py_INCREF(op); if (PyArray_SetBaseObject(*ret, (PyObject *)op) < 0) { Py_DECREF(*ret); *ret = NULL; - return 0; + return -1; } - return 1; + return 0; } /* Return success, but that we couldn't make a view */ *ret = NULL; - return 1; + return 0; } static PyArrayObject * @@ -2269,16 +2172,6 @@ get_combined_dims_view(PyArrayObject *op, int iop, char *labels) if (ret == NULL) { return NULL; } - if (!PyArray_Check(ret)) { - Py_DECREF(ret); - PyErr_SetString(PyExc_RuntimeError, - "NewFromDescr failed to return an array"); - return NULL; - } - PyArray_UpdateFlags(ret, - NPY_ARRAY_C_CONTIGUOUS| - NPY_ARRAY_ALIGNED| - NPY_ARRAY_F_CONTIGUOUS); Py_INCREF(op); if (PyArray_SetBaseObject(ret, (PyObject *)op) < 0) { Py_DECREF(ret); @@ -2332,7 +2225,7 @@ prepare_op_axes(int ndim, int iop, char *labels, int *axes, } } - return 1; + return 0; } static int @@ -2613,7 +2506,7 @@ 
PyArray_EinsteinSum(char *subscripts, npy_intp nop, NPY_ORDER order, NPY_CASTING casting, PyArrayObject *out) { - int iop, label, min_label = 127, max_label = 0, num_labels; + int iop, label, min_label = 127, max_label = 0; char label_counts[128]; char op_labels[NPY_MAXARGS][NPY_MAXDIMS]; char output_labels[NPY_MAXDIMS], *iter_labels; @@ -2644,7 +2537,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, /* Parse the subscripts string into label_counts and op_labels */ memset(label_counts, 0, sizeof(label_counts)); - num_labels = 0; for (iop = 0; iop < nop; ++iop) { int length = (int)strcspn(subscripts, ",-"); @@ -2661,10 +2553,10 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, return NULL; } - if (!parse_operand_subscripts(subscripts, length, + if (parse_operand_subscripts(subscripts, length, PyArray_NDIM(op_in[iop]), iop, op_labels[iop], label_counts, - &min_label, &max_label, &num_labels)) { + &min_label, &max_label) < 0) { return NULL; } @@ -2698,21 +2590,18 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, } /* - * If there is no output signature, create one using each label - * that appeared once, in alphabetical order + * If there is no output signature, fill output_labels and ndim_output + * using each label that appeared once, in alphabetical order. */ if (subscripts[0] == '\0') { - char outsubscripts[NPY_MAXDIMS + 3]; - int length; - /* If no output was specified, always broadcast left (like normal) */ - outsubscripts[0] = '.'; - outsubscripts[1] = '.'; - outsubscripts[2] = '.'; - length = 3; + /* If no output was specified, always broadcast left, as usual. 
*/ + for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) { + output_labels[ndim_output] = 0; + } for (label = min_label; label <= max_label; ++label) { if (label_counts[label] == 1) { - if (length < NPY_MAXDIMS-1) { - outsubscripts[length++] = label; + if (ndim_output < NPY_MAXDIMS) { + output_labels[ndim_output++] = label; } else { PyErr_SetString(PyExc_ValueError, @@ -2722,10 +2611,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, } } } - /* Parse the output subscript string */ - ndim_output = parse_output_subscripts(outsubscripts, length, - ndim_broadcast, label_counts, - output_labels); } else { if (subscripts[0] != '-' || subscripts[1] != '>') { @@ -2736,13 +2621,13 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, } subscripts += 2; - /* Parse the output subscript string */ + /* Parse the output subscript string. */ ndim_output = parse_output_subscripts(subscripts, strlen(subscripts), ndim_broadcast, label_counts, output_labels); - } - if (ndim_output < 0) { - return NULL; + if (ndim_output < 0) { + return NULL; + } } if (out != NULL && PyArray_NDIM(out) != ndim_output) { @@ -2776,9 +2661,9 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, if (iop == 0 && nop == 1 && out == NULL) { ret = NULL; - if (!get_single_op_view(op_in[iop], iop, labels, - ndim_output, output_labels, - &ret)) { + if (get_single_op_view(op_in[iop], iop, labels, + ndim_output, output_labels, + &ret) < 0) { return NULL; } @@ -2840,8 +2725,8 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, for (iop = 0; iop < nop; ++iop) { op_axes[iop] = op_axes_arrays[iop]; - if (!prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop], - op_axes[iop], ndim_iter, iter_labels)) { + if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop], + op_axes[iop], ndim_iter, iter_labels) < 0) { goto fail; } } diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c index 86e6e7a2f..d86f90a53 100644 --- a/numpy/core/src/multiarray/getset.c +++ 
b/numpy/core/src/multiarray/getset.c @@ -758,7 +758,6 @@ _get_part(PyArrayObject *self, int imag) Py_DECREF(ret); return NULL; } - PyArray_CLEARFLAGS(ret, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS); return ret; } diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index eb9ef5915..d010b2e75 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -955,9 +955,10 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, NPY_BEGIN_THREADS_DEF; - rop = (PyArrayObject *)PyArray_New(Py_TYPE(op), PyArray_NDIM(op), - PyArray_DIMS(op), NPY_INTP, - NULL, NULL, 0, 0, (PyObject *)op); + rop = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(op), PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(op), PyArray_DIMS(op), NULL, NULL, + 0, (PyObject *)op); if (rop == NULL) { return NULL; } @@ -1439,10 +1440,10 @@ PyArray_LexSort(PyObject *sort_keys, int axis) nd = PyArray_NDIM(mps[0]); if ((nd == 0) || (PyArray_SIZE(mps[0]) == 1)) { /* single element case */ - ret = (PyArrayObject *)PyArray_New(&PyArray_Type, PyArray_NDIM(mps[0]), - PyArray_DIMS(mps[0]), - NPY_INTP, - NULL, NULL, 0, 0, NULL); + ret = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(mps[0]), PyArray_DIMS(mps[0]), NULL, NULL, + 0, NULL); if (ret == NULL) { goto fail; @@ -1463,9 +1464,10 @@ PyArray_LexSort(PyObject *sort_keys, int axis) } /* Now do the sorting */ - ret = (PyArrayObject *)PyArray_New(&PyArray_Type, PyArray_NDIM(mps[0]), - PyArray_DIMS(mps[0]), NPY_INTP, - NULL, NULL, 0, 0, NULL); + ret = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(mps[0]), PyArray_DIMS(mps[0]), NULL, NULL, + 0, NULL); if (ret == NULL) { goto fail; } @@ -1737,9 +1739,10 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, } /* ret is a contiguous array of intp type to hold returned indexes */ - 
ret = (PyArrayObject *)PyArray_New(&PyArray_Type, PyArray_NDIM(ap2), - PyArray_DIMS(ap2), NPY_INTP, - NULL, NULL, 0, 0, (PyObject *)ap2); + ret = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + PyArray_NDIM(ap2), PyArray_DIMS(ap2), NULL, NULL, + 0, (PyObject *)ap2); if (ret == NULL) { goto fail; } @@ -2207,9 +2210,10 @@ PyArray_Nonzero(PyArrayObject *self) /* Allocate the result as a 2D array */ ret_dims[0] = nonzero_count; ret_dims[1] = (ndim == 0) ? 1 : ndim; - ret = (PyArrayObject *)PyArray_New(&PyArray_Type, 2, ret_dims, - NPY_INTP, NULL, NULL, 0, 0, - NULL); + ret = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + 2, ret_dims, NULL, NULL, + 0, NULL); if (ret == NULL) { return NULL; } @@ -2361,10 +2365,10 @@ finish: /* the result is an empty array, the view must point to valid memory */ npy_intp data_offset = is_empty ? 0 : i * NPY_SIZEOF_INTP; - PyArrayObject *view = (PyArrayObject *)PyArray_New(Py_TYPE(ret), 1, - &nonzero_count, NPY_INTP, &stride, - PyArray_BYTES(ret) + data_offset, - 0, PyArray_FLAGS(ret), (PyObject *)ret); + PyArrayObject *view = (PyArrayObject *)PyArray_NewFromDescr( + Py_TYPE(ret), PyArray_DescrFromType(NPY_INTP), + 1, &nonzero_count, &stride, PyArray_BYTES(ret) + data_offset, + PyArray_FLAGS(ret), (PyObject *)ret); if (view == NULL) { Py_DECREF(ret); Py_DECREF(ret_tuple); diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index 397aaf209..fa68af19a 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -1373,7 +1373,7 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim, /* * Advanded indexing iteration of arrays when there is a single indexing * array which has the same memory order as the value array and both - * can be trivally iterated (single stride, aligned, no casting necessary). 
+ * can be trivially iterated (single stride, aligned, no casting necessary). */ NPY_NO_EXPORT int mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind, @@ -1747,7 +1747,7 @@ mapiter_@name@(PyArrayMapIterObject *mit) } else { /* - * faster resetting if the subspace iteration is trival. + * faster resetting if the subspace iteration is trivial. * reset_offsets are zero for positive strides, * for negative strides this shifts the pointer to the last * item. diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 4b2c6aa5a..f2782ff27 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -293,8 +293,7 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n) if (commit_to_unpack) { /* propagate errors */ if (tmp_obj == NULL) { - multi_DECREF(result, i); - return -1; + goto fail; } } else { @@ -313,6 +312,16 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n) || PySlice_Check(tmp_obj) || tmp_obj == Py_Ellipsis || tmp_obj == Py_None) { + if (DEPRECATE_FUTUREWARNING( + "Using a non-tuple sequence for multidimensional " + "indexing is deprecated; use `arr[tuple(seq)]` " + "instead of `arr[seq]`. 
In the future this will be " + "interpreted as an array index, `arr[np.array(seq)]`, " + "which will result either in an error or a different " + "result.") < 0) { + i++; /* since loop update doesn't run */ + goto fail; + } commit_to_unpack = 1; } } @@ -328,6 +337,10 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n) multi_DECREF(result, i); return unpack_scalar(index, result, result_n); } + +fail: + multi_DECREF(result, i); + return -1; } /** @@ -2202,9 +2215,10 @@ _nonzero_indices(PyObject *myBool, PyArrayObject **arrays) /* create count-sized index arrays for each dimension */ for (j = 0; j < nd; j++) { - new = (PyArrayObject *)PyArray_New(&PyArray_Type, 1, &count, - NPY_INTP, NULL, NULL, - 0, 0, NULL); + new = (PyArrayObject *)PyArray_NewFromDescr( + &PyArray_Type, PyArray_DescrFromType(NPY_INTP), + 1, &count, NULL, NULL, + 0, NULL); if (new == NULL) { goto fail; } diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 004af8a70..ed339b98d 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -388,8 +388,6 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset) Py_DECREF(ret); return NULL; } - - PyArray_UpdateFlags((PyArrayObject *)ret, NPY_ARRAY_UPDATE_ALL); return ret; } diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 7eccb4a4b..896a3b07e 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -197,7 +197,7 @@ PyArray_CompareLists(npy_intp *l1, npy_intp *l2, int n) } /* - * simulates a C-style 1-3 dimensional array which can be accesed using + * simulates a C-style 1-3 dimensional array which can be accessed using * ptr[i] or ptr[i][j] or ptr[i][j][k] -- requires pointer allocation * for 2-d and 3-d. 
* @@ -2652,21 +2652,31 @@ einsum_list_to_subscripts(PyObject *obj, char *subscripts, int subsize) /* Subscript */ else if (PyInt_Check(item) || PyLong_Check(item)) { long s = PyInt_AsLong(item); - if ( s < 0 || s > 2*26) { + npy_bool bad_input = 0; + + if (subindex + 1 >= subsize) { PyErr_SetString(PyExc_ValueError, - "subscript is not within the valid range [0, 52]"); + "subscripts list is too long"); Py_DECREF(obj); return -1; } - if (s < 26) { - subscripts[subindex++] = 'A' + s; + + if ( s < 0 ) { + bad_input = 1; + } + else if (s < 26) { + subscripts[subindex++] = 'A' + (char)s; + } + else if (s < 2*26) { + subscripts[subindex++] = 'a' + (char)s - 26; } else { - subscripts[subindex++] = 'a' + s; + bad_input = 1; } - if (subindex >= subsize) { + + if (bad_input) { PyErr_SetString(PyExc_ValueError, - "subscripts list is too long"); + "subscript is not within the valid range [0, 52)"); Py_DECREF(obj); return -1; } @@ -3605,7 +3615,7 @@ as_buffer(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) /* - * Prints floating-point scalars usign the Dragon4 algorithm, scientific mode. + * Prints floating-point scalars using the Dragon4 algorithm, scientific mode. * See docstring of `np.format_float_scientific` for description of arguments. * The differences is that a value of -1 is valid for pad_left, exp_digits, * precision, which is equivalent to `None`. @@ -3661,7 +3671,7 @@ dragon4_scientific(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) } /* - * Prints floating-point scalars usign the Dragon4 algorithm, positional mode. + * Prints floating-point scalars using the Dragon4 algorithm, positional mode. * See docstring of `np.format_float_positional` for description of arguments. * The differences is that a value of -1 is valid for pad_left, pad_right, * precision, which is equivalent to `None`. 
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c index 152955940..28020f79a 100644 --- a/numpy/core/src/multiarray/nditer_api.c +++ b/numpy/core/src/multiarray/nditer_api.c @@ -1153,8 +1153,6 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i) Py_DECREF(view); return NULL; } - /* Make sure all the flags are good */ - PyArray_UpdateFlags(view, NPY_ARRAY_UPDATE_ALL); return view; } diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index c512cf208..b07137858 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -2675,9 +2675,6 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, return NULL; } - /* Make sure all the flags are good */ - PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL); - /* Double-check that the subtype didn't mess with the dimensions */ if (subtype != &PyArray_Type) { if (PyArray_NDIM(ret) != op_ndim || diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index 4505e645b..50a138167 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -2077,8 +2077,6 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i) return NULL; } - PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL); - return (PyObject *)ret; } @@ -2216,8 +2214,6 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v) return -1; } - PyArray_UpdateFlags(tmp, NPY_ARRAY_UPDATE_ALL); - ret = PyArray_CopyObject(tmp, v); Py_DECREF(tmp); return ret; diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index 915d743c8..14389a925 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -476,7 +476,9 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace, double exponent; NPY_SCALARKIND kind; /* NPY_NOSCALAR is not scalar */ - if (PyArray_Check(a1) && 
((kind=is_scalar_with_conversion(o2, &exponent))>0)) { + if (PyArray_Check(a1) && + !PyArray_ISOBJECT(a1) && + ((kind=is_scalar_with_conversion(o2, &exponent))>0)) { PyObject *fastop = NULL; if (PyArray_ISFLOAT(a1) || PyArray_ISCOMPLEX(a1)) { if (exponent == 1.0) { diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index cb4af0d12..25e0668ed 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -423,6 +423,7 @@ gentype_format(PyObject *self, PyObject *args) /**begin repeat * #name = half, float, double, longdouble# + * #Name = Half, Float, Double, LongDouble# * #NAME = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #type = npy_half, npy_float, npy_double, npy_longdouble# * #suff = h, f, d, l# @@ -434,12 +435,12 @@ format_@name@(@type@ val, npy_bool scientific, int pad_left, int pad_right, int exp_digits) { if (scientific) { - return Dragon4_Scientific_AnySize(&val, sizeof(@type@), + return Dragon4_Scientific_@Name@(&val, DigitMode_Unique, precision, sign, trim, pad_left, exp_digits); } else { - return Dragon4_Positional_AnySize(&val, sizeof(@type@), + return Dragon4_Positional_@Name@(&val, DigitMode_Unique, CutoffMode_TotalLength, precision, sign, trim, pad_left, pad_right); } @@ -4201,7 +4202,7 @@ doubletype_print(PyObject *o, FILE *fp, int flags) return -1; } - ret = PyObject_Print(to_print, fp, flags); + ret = PyObject_Print(to_print, fp, Py_PRINT_RAW); Py_DECREF(to_print); return ret; } diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 05c24d6da..1424a69f3 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -189,7 +189,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, npy_intp *dimensions = newdims->ptr; PyArrayObject *ret; int ndim = newdims->len; - npy_bool same, incref = NPY_TRUE; + npy_bool same; npy_intp *strides = NULL; npy_intp newstrides[NPY_MAXDIMS]; int flags; @@ 
-230,6 +230,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, * data in the order it is in. * NPY_RELAXED_STRIDES_CHECKING: size check is unnecessary when set. */ + Py_INCREF(self); if ((PyArray_SIZE(self) > 1) && ((order == NPY_CORDER && !PyArray_IS_C_CONTIGUOUS(self)) || (order == NPY_FORTRANORDER && !PyArray_IS_F_CONTIGUOUS(self)))) { @@ -243,10 +244,10 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, else { PyObject *newcopy; newcopy = PyArray_NewCopy(self, order); + Py_DECREF(self); if (newcopy == NULL) { return NULL; } - incref = NPY_FALSE; self = (PyArrayObject *)newcopy; } } @@ -277,21 +278,14 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, goto fail; } - if (incref) { - Py_INCREF(self); - } if (PyArray_SetBaseObject(ret, (PyObject *)self)) { Py_DECREF(ret); return NULL; } - - PyArray_UpdateFlags(ret, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS); return (PyObject *)ret; fail: - if (!incref) { - Py_DECREF(self); - } + Py_DECREF(self); return NULL; } @@ -970,9 +964,6 @@ PyArray_Ravel(PyArrayObject *arr, NPY_ORDER order) if (ret == NULL) { return NULL; } - - PyArray_UpdateFlags(ret, - NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_F_CONTIGUOUS); Py_INCREF(arr); if (PyArray_SetBaseObject(ret, (PyObject *)arr) < 0) { Py_DECREF(ret); diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src index bca690b4d..ea1792887 100644 --- a/numpy/core/src/npymath/ieee754.c.src +++ b/numpy/core/src/npymath/ieee754.c.src @@ -6,6 +6,7 @@ */ #include "npy_math_common.h" #include "npy_math_private.h" +#include "numpy/utils.h" #ifndef HAVE_COPYSIGN double npy_copysign(double x, double y) @@ -183,6 +184,7 @@ static npy_longdouble _nextl(npy_longdouble x, int p) { npy_int64 hx,ihx,ilx; npy_uint64 lx; + npy_longdouble u; GET_LDOUBLE_WORDS64(hx, lx, x); ihx = hx & 0x7fffffffffffffffLL; /* |hx| */ @@ -193,7 +195,6 @@ static npy_longdouble _nextl(npy_longdouble x, int p) return x; /* signal the nan */ } if(ihx == 0 && 
ilx == 0) { /* x == 0 */ - npy_longdouble u; SET_LDOUBLE_WORDS64(x, p, 0ULL);/* return +-minsubnormal */ u = x * x; if (u == x) { @@ -203,7 +204,6 @@ static npy_longdouble _nextl(npy_longdouble x, int p) } } - npy_longdouble u; if(p < 0) { /* p < 0, x -= ulp */ if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL)) return x+x; /* overflow, return -inf */ @@ -557,6 +557,15 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y) } #endif +int npy_clear_floatstatus() { + char x=0; + return npy_clear_floatstatus_barrier(&x); +} +int npy_get_floatstatus() { + char x=0; + return npy_get_floatstatus_barrier(&x); +} + /* * Functions to set the floating point status word. * keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h @@ -574,18 +583,24 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y) defined(__NetBSD__) #include <ieeefp.h> -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char * param)) { int fpstatus = fpgetsticky(); + /* + * By using a volatile, the compiler cannot reorder this call + */ + if (param != NULL) { + volatile char NPY_UNUSED(c) = *(char*)param; + } return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) | ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | ((FP_X_INV & fpstatus) ? 
NPY_FPE_INVALID : 0); } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char * param) { - int fpstatus = npy_get_floatstatus(); + int fpstatus = npy_get_floatstatus_barrier(param); fpsetsticky(0); return fpstatus; @@ -617,10 +632,16 @@ void npy_set_floatstatus_invalid(void) (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) # include <fenv.h> -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char* param) { int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); + /* + * By using a volatile, the compiler cannot reorder this call + */ + if (param != NULL) { + volatile char NPY_UNUSED(c) = *(char*)param; + } return ((FE_DIVBYZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | ((FE_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | @@ -628,10 +649,10 @@ int npy_get_floatstatus(void) ((FE_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char * param) { /* testing float status is 50-100 times faster than clearing on x86 */ - int fpstatus = npy_get_floatstatus(); + int fpstatus = npy_get_floatstatus_barrier(param); if (fpstatus != 0) { feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); @@ -665,18 +686,24 @@ void npy_set_floatstatus_invalid(void) #include <float.h> #include <fpxcp.h> -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char *param) { int fpstatus = fp_read_flag(); + /* + * By using a volatile, the compiler cannot reorder this call + */ + if (param != NULL) { + volatile char NPY_UNUSED(c) = *(char*)param; + } return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | ((FP_INVALID & fpstatus) ? 
NPY_FPE_INVALID : 0); } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char * param) { - int fpstatus = npy_get_floatstatus(); + int fpstatus = npy_get_floatstatus_barrier(param); fp_swap_flag(0); return fpstatus; @@ -710,8 +737,11 @@ void npy_set_floatstatus_invalid(void) #include <float.h> -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char *param) { + /* + * By using a volatile, the compiler cannot reorder this call + */ #if defined(_WIN64) int fpstatus = _statusfp(); #else @@ -720,15 +750,18 @@ int npy_get_floatstatus(void) _statusfp2(&fpstatus, &fpstatus2); fpstatus |= fpstatus2; #endif + if (param != NULL) { + volatile char NPY_UNUSED(c) = *(char*)param; + } return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char *param) { - int fpstatus = npy_get_floatstatus(); + int fpstatus = npy_get_floatstatus_barrier(param); _clearfp(); return fpstatus; @@ -739,18 +772,24 @@ int npy_clear_floatstatus(void) #include <machine/fpu.h> -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char *param) { unsigned long fpstatus = ieee_get_fp_control(); + /* + * By using a volatile, the compiler cannot reorder this call + */ + if (param != NULL) { + volatile char NPY_UNUSED(c) = *(char*)param; + } return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) | ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | ((IEEE_STATUS_INV & fpstatus) ? 
NPY_FPE_INVALID : 0); } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char *param) { - long fpstatus = npy_get_floatstatus(); + int fpstatus = npy_get_floatstatus_barrier(param); /* clear status bits as well as disable exception mode if on */ ieee_set_fp_control(0); @@ -759,13 +798,14 @@ int npy_clear_floatstatus(void) #else -int npy_get_floatstatus(void) +int npy_get_floatstatus_barrier(char *NPY_UNUSED(param)) { return 0; } -int npy_clear_floatstatus(void) +int npy_clear_floatstatus_barrier(char *param) { + int fpstatus = npy_get_floatstatus_barrier(param); return 0; } diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h index d75b9e991..e4a919db6 100644 --- a/numpy/core/src/npymath/npy_math_private.h +++ b/numpy/core/src/npymath/npy_math_private.h @@ -287,8 +287,7 @@ do { \ typedef npy_uint32 ldouble_man_t; typedef npy_uint32 ldouble_exp_t; typedef npy_uint32 ldouble_sign_t; -#elif defined(HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE) || \ - defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE) +#elif defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE) /* 64 bits IEEE double precision aligned on 16 bytes: used by ppc arch on * Mac OS X */ @@ -435,8 +434,8 @@ do { \ typedef npy_uint32 ldouble_sign_t; #endif -#if !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) && \ - !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE) +#if !defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE) && \ + !defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE) /* Get the sign bit of x. 
x should be of type IEEEl2bitsrep */ #define GET_LDOUBLE_SIGN(x) \ (((x).a[LDBL_SIGN_INDEX] & LDBL_SIGN_MASK) >> LDBL_SIGN_SHIFT) @@ -477,7 +476,7 @@ do { \ ((x).a[LDBL_MANH_INDEX] & ~LDBL_MANH_MASK) | \ (((IEEEl2bitsrep_part)(v) << LDBL_MANH_SHIFT) & LDBL_MANH_MASK)) -#endif /* #ifndef HAVE_LDOUBLE_DOUBLE_DOUBLE_BE */ +#endif /* !HAVE_LDOUBLE_DOUBLE_DOUBLE_* */ /* * Those unions are used to convert a pointer of npy_cdouble to native C99 diff --git a/numpy/core/src/private/npy_fpmath.h b/numpy/core/src/private/npy_fpmath.h index 86b9cf3da..dbb3fb23d 100644 --- a/numpy/core/src/private/npy_fpmath.h +++ b/numpy/core/src/private/npy_fpmath.h @@ -7,45 +7,24 @@ #include "numpy/npy_cpu.h" #include "numpy/npy_common.h" -#ifdef NPY_OS_DARWIN - /* This hardcoded logic is fragile, but universal builds makes it - * difficult to detect arch-specific features */ - - /* MAC OS X < 10.4 and gcc < 4 does not support proper long double, and - * is the same as double on those platforms */ - #if NPY_BITSOF_LONGDOUBLE == NPY_BITSOF_DOUBLE - /* This assumes that FPU and ALU have the same endianness */ - #if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN - #define HAVE_LDOUBLE_IEEE_DOUBLE_LE - #elif NPY_BYTE_ORDER == NPY_BIG_ENDIAN - #define HAVE_LDOUBLE_IEEE_DOUBLE_BE - #else - #error Endianness undefined ? 
- #endif - #else - #if defined(NPY_CPU_X86) - #define HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE - #elif defined(NPY_CPU_AMD64) - #define HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE - #elif defined(NPY_CPU_PPC) || defined(NPY_CPU_PPC64) - #define HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE - #elif defined(NPY_CPU_PPC64LE) - #define HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_LE - #endif - #endif -#endif - #if !(defined(HAVE_LDOUBLE_IEEE_QUAD_BE) || \ defined(HAVE_LDOUBLE_IEEE_QUAD_LE) || \ defined(HAVE_LDOUBLE_IEEE_DOUBLE_LE) || \ defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE) || \ - defined(HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE) || \ defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE) || \ defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE) || \ defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE) || \ - defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) || \ - defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)) + defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE) || \ + defined(HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE)) #error No long double representation defined #endif +/* for back-compat, also keep old name for double-double */ +#ifdef HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE + #define HAVE_LDOUBLE_DOUBLE_DOUBLE_LE +#endif +#ifdef HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE + #define HAVE_LDOUBLE_DOUBLE_DOUBLE_BE +#endif + #endif diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index 120ce0332..fcbdbe330 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -253,6 +253,38 @@ static void /**end repeat**/ +char *cumsum_signature = "(i)->(i)"; + +/* + * This implements the function + * out[n] = sum_i^n in[i] + */ + +/**begin repeat + + #TYPE=LONG,DOUBLE# + #typ=npy_long,npy_double# +*/ + +static void +@TYPE@_cumsum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +{ + INIT_OUTER_LOOP_2 + npy_intp di = dimensions[0]; + npy_intp i; + npy_intp is=steps[0], os=steps[1]; + BEGIN_OUTER_LOOP_2 + char *ip=args[0], *op=args[1]; + @typ@ cumsum = 0; + 
for (i = 0; i < di; i++, ip += is, op += os) { + cumsum += (*(@typ@ *)ip); + *(@typ@ *)op = cumsum; + } + END_OUTER_LOOP +} + +/**end repeat**/ + static PyUFuncGenericFunction inner1d_functions[] = { LONG_inner1d, DOUBLE_inner1d }; static void * inner1d_data[] = { (void *)NULL, (void *)NULL }; @@ -270,8 +302,12 @@ static void *eucldiean_pdist_data[] = { (void *)NULL, (void *)NULL }; static char euclidean_pdist_signatures[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE }; +static PyUFuncGenericFunction cumsum_functions[] = { LONG_cumsum, DOUBLE_cumsum }; +static void * cumsum_data[] = { (void *)NULL, (void *)NULL }; +static char cumsum_signatures[] = { NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE }; -static void + +static int addUfuncs(PyObject *dictionary) { PyObject *f; @@ -280,6 +316,13 @@ addUfuncs(PyObject *dictionary) { "inner on the last dimension and broadcast on the rest \n" " \"(i),(i)->()\" \n", 0, inner1d_signature); + /* + * yes, this should not happen, but I (MHvK) just spent an hour looking at + * segfaults because I screwed up something that seemed totally unrelated. 
+ */ + if (f == NULL) { + return -1; + } PyDict_SetItemString(dictionary, "inner1d", f); Py_DECREF(f); f = PyUFunc_FromFuncAndDataAndSignature(innerwt_functions, innerwt_data, @@ -287,6 +330,9 @@ addUfuncs(PyObject *dictionary) { "inner1d with a weight argument \n" " \"(i),(i),(i)->()\" \n", 0, innerwt_signature); + if (f == NULL) { + return -1; + } PyDict_SetItemString(dictionary, "innerwt", f); Py_DECREF(f); f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions, @@ -295,6 +341,9 @@ addUfuncs(PyObject *dictionary) { "matrix multiplication on last two dimensions \n" " \"(m,n),(n,p)->(m,p)\" \n", 0, matrix_multiply_signature); + if (f == NULL) { + return -1; + } PyDict_SetItemString(dictionary, "matrix_multiply", f); Py_DECREF(f); f = PyUFunc_FromFuncAndDataAndSignature(euclidean_pdist_functions, @@ -303,27 +352,48 @@ addUfuncs(PyObject *dictionary) { "pairwise euclidean distance on last two dimensions \n" " \"(n,d)->(p)\" \n", 0, euclidean_pdist_signature); + if (f == NULL) { + return -1; + } PyDict_SetItemString(dictionary, "euclidean_pdist", f); Py_DECREF(f); + f = PyUFunc_FromFuncAndDataAndSignature(cumsum_functions, + cumsum_data, cumsum_signatures, + 2, 1, 1, PyUFunc_None, "cumsum", + "Cumulative sum of the input (n)->(n)\n", + 0, cumsum_signature); + if (f == NULL) { + return -1; + } + PyDict_SetItemString(dictionary, "cumsum", f); + Py_DECREF(f); f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data, inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d_no_doc", NULL, 0, inner1d_signature); + if (f == NULL) { + return -1; + } PyDict_SetItemString(dictionary, "inner1d_no_doc", f); Py_DECREF(f); + + return 0; } static PyObject * UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args) { - int nin, nout; + int nin, nout, i; PyObject *signature, *sig_str; - PyObject *f; + PyUFuncObject *f = NULL; + PyObject *core_num_dims = NULL, *core_dim_ixs = NULL; int core_enabled; + int core_num_ixs = 0; - if 
(!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) return NULL; - + if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) { + return NULL; + } if (PyString_Check(signature)) { sig_str = signature; @@ -334,17 +404,60 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args) return NULL; } - f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL, NULL, + f = (PyUFuncObject*)PyUFunc_FromFuncAndDataAndSignature( + NULL, NULL, NULL, 0, nin, nout, PyUFunc_None, "no name", "doc:none", 1, PyString_AS_STRING(sig_str)); if (sig_str != signature) { Py_DECREF(sig_str); } - if (f == NULL) return NULL; - core_enabled = ((PyUFuncObject*)f)->core_enabled; + if (f == NULL) { + return NULL; + } + core_enabled = f->core_enabled; + /* + * Don't presume core_num_dims and core_dim_ixs are defined; + * they currently are even if core_enabled=0, but there's no real + * reason they should be. So avoid segfaults if we change our mind. + */ + if (f->core_num_dims != NULL) { + core_num_dims = PyTuple_New(f->nargs); + if (core_num_dims == NULL) { + goto fail; + } + for (i = 0; i < f->nargs; i++) { + PyObject *val = PyLong_FromLong(f->core_num_dims[i]); + PyTuple_SET_ITEM(core_num_dims, i, val); + core_num_ixs += f->core_num_dims[i]; + } + } + else { + Py_INCREF(Py_None); + core_num_dims = Py_None; + } + if (f->core_dim_ixs != NULL) { + core_dim_ixs = PyTuple_New(core_num_ixs); + if (core_num_dims == NULL) { + goto fail; + } + for (i = 0; i < core_num_ixs; i++) { + PyObject * val = PyLong_FromLong(f->core_dim_ixs[i]); + PyTuple_SET_ITEM(core_dim_ixs, i, val); + } + } + else { + Py_INCREF(Py_None); + core_dim_ixs = Py_None; + } Py_DECREF(f); - return Py_BuildValue("i", core_enabled); + return Py_BuildValue("iOO", core_enabled, core_num_dims, core_dim_ixs); + +fail: + Py_XDECREF(f); + Py_XDECREF(core_num_dims); + Py_XDECREF(core_dim_ixs); + return NULL; } static PyMethodDef UMath_TestsMethods[] = { @@ -371,15 +484,14 @@ static struct PyModuleDef moduledef = { }; #endif 
+/* Initialization function for the module */ #if defined(NPY_PY3K) -#define RETVAL m -PyMODINIT_FUNC PyInit__umath_tests(void) +#define RETVAL(x) x +PyMODINIT_FUNC PyInit__umath_tests(void) { #else -#define RETVAL -PyMODINIT_FUNC -init_umath_tests(void) +#define RETVAL(x) +PyMODINIT_FUNC init_umath_tests(void) { #endif -{ PyObject *m; PyObject *d; PyObject *version; @@ -389,9 +501,9 @@ init_umath_tests(void) #else m = Py_InitModule("_umath_tests", UMath_TestsMethods); #endif - if (m == NULL) - return RETVAL; - + if (m == NULL) { + return RETVAL(NULL); + } import_array(); import_ufunc(); @@ -402,12 +514,13 @@ init_umath_tests(void) Py_DECREF(version); /* Load the ufunc operators into the module's namespace */ - addUfuncs(d); - - if (PyErr_Occurred()) { + if (addUfuncs(d) < 0) { + Py_DECREF(m); + PyErr_Print(); PyErr_SetString(PyExc_RuntimeError, "cannot load _umath_tests module."); + return RETVAL(NULL); } - return RETVAL; + return RETVAL(m); } diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c index e44036358..188054e22 100644 --- a/numpy/core/src/umath/extobj.c +++ b/numpy/core/src/umath/extobj.c @@ -284,7 +284,7 @@ _check_ufunc_fperr(int errmask, PyObject *extobj, const char *ufunc_name) { if (!errmask) { return 0; } - fperr = PyUFunc_getfperr(); + fperr = npy_get_floatstatus_barrier((char*)extobj); if (!fperr) { return 0; } diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 8b1c7e703..1ca298b30 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1819,7 +1819,7 @@ NPY_NO_EXPORT void *((npy_bool *)op1) = @func@(in1) != 0; } } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ @@ -1866,6 +1866,9 @@ NPY_NO_EXPORT void const @type@ in2 = *(@type@ *)ip2; io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? 
io1 : in2; } + if (npy_isnan(io1)) { + npy_set_floatstatus_invalid(); + } *((@type@ *)iop1) = io1; } } @@ -1901,7 +1904,7 @@ NPY_NO_EXPORT void *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; } } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ @@ -1991,7 +1994,7 @@ NPY_NO_EXPORT void *((@type@ *)op1) = tmp + 0; } } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } NPY_NO_EXPORT void @@ -2177,7 +2180,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED const npy_half in1 = *(npy_half *)ip1; *((npy_bool *)op1) = @func@(in1) != 0; } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat**/ @@ -2239,7 +2242,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED const npy_half in2 = *(npy_half *)ip2; *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2; } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat**/ @@ -2681,7 +2684,7 @@ NPY_NO_EXPORT void const @ftype@ in1i = ((@ftype@ *)ip1)[1]; *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i); } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ @@ -2790,7 +2793,7 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in2i; } } - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index 0aef093b0..c0bc47b7b 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -29,7 +29,10 @@ normalize_signature_keyword(PyObject *normal_kwds) "cannot specify both 'sig' and 'signature'"); return -1; } - Py_INCREF(obj); + /* + * No INCREF or DECREF needed: got a borrowed reference above, + * and, unlike e.g. PyList_SetItem, PyDict_SetItem INCREF's it. 
+ */ PyDict_SetItemString(normal_kwds, "signature", obj); PyDict_DelItemString(normal_kwds, "sig"); } @@ -48,6 +51,7 @@ normalize___call___args(PyUFuncObject *ufunc, PyObject *args, npy_intp nin = ufunc->nin; npy_intp nout = ufunc->nout; npy_intp nargs = PyTuple_GET_SIZE(args); + npy_intp nkwds = PyDict_Size(*normal_kwds); PyObject *obj; if (nargs < nin) { @@ -71,7 +75,7 @@ normalize___call___args(PyUFuncObject *ufunc, PyObject *args, /* If we have more args than nin, they must be the output variables.*/ if (nargs > nin) { - if(PyDict_GetItemString(*normal_kwds, "out")) { + if(nkwds > 0 && PyDict_GetItemString(*normal_kwds, "out")) { PyErr_Format(PyExc_TypeError, "argument given by name ('out') and position " "(%"NPY_INTP_FMT")", nin); @@ -109,8 +113,15 @@ normalize___call___args(PyUFuncObject *ufunc, PyObject *args, Py_DECREF(obj); } } + /* gufuncs accept either 'axes' or 'axis', but not both */ + if (nkwds >= 2 && (PyDict_GetItemString(*normal_kwds, "axis") && + PyDict_GetItemString(*normal_kwds, "axes"))) { + PyErr_SetString(PyExc_TypeError, + "cannot specify both 'axis' and 'axes'"); + return -1; + } /* finally, ufuncs accept 'sig' or 'signature' normalize to 'signature' */ - return normalize_signature_keyword(*normal_kwds); + return nkwds == 0 ? 
0 : normalize_signature_keyword(*normal_kwds); } static int @@ -123,11 +134,16 @@ normalize_reduce_args(PyUFuncObject *ufunc, PyObject *args, npy_intp nargs = PyTuple_GET_SIZE(args); npy_intp i; PyObject *obj; - static char *kwlist[] = {"array", "axis", "dtype", "out", "keepdims"}; + static PyObject *NoValue = NULL; + static char *kwlist[] = {"array", "axis", "dtype", "out", "keepdims", + "initial"}; + + npy_cache_import("numpy", "_NoValue", &NoValue); + if (NoValue == NULL) return -1; - if (nargs < 1 || nargs > 5) { + if (nargs < 1 || nargs > 6) { PyErr_Format(PyExc_TypeError, - "ufunc.reduce() takes from 1 to 5 positional " + "ufunc.reduce() takes from 1 to 6 positional " "arguments but %"NPY_INTP_FMT" were given", nargs); return -1; } @@ -151,6 +167,10 @@ normalize_reduce_args(PyUFuncObject *ufunc, PyObject *args, } obj = PyTuple_GetSlice(args, 3, 4); } + /* Remove initial=np._NoValue */ + if (i == 5 && obj == NoValue) { + continue; + } PyDict_SetItemString(*normal_kwds, kwlist[i], obj); if (i == 3) { Py_DECREF(obj); @@ -282,7 +302,6 @@ normalize_outer_args(PyUFuncObject *ufunc, PyObject *args, if (*normal_args == NULL) { return -1; } - /* ufuncs accept 'sig' or 'signature' normalize to 'signature' */ return normalize_signature_keyword(*normal_kwds); } diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c index 681d3fefa..5c3a84e21 100644 --- a/numpy/core/src/umath/reduction.c +++ b/numpy/core/src/umath/reduction.c @@ -537,7 +537,7 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out, } /* Start with the floating-point exception flags cleared */ - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&iter); if (NpyIter_GetIterSize(iter) != 0) { NpyIter_IterNextFunc *iternext; diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 6e1fb1ee8..3e29c4b4e 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -848,7 +848,7 @@ static 
PyObject * } #if @fperr@ - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&out); #endif /* @@ -863,7 +863,7 @@ static PyObject * #if @fperr@ /* Check status flag. If it is set, then look up what to do */ - retstatus = PyUFunc_getfperr(); + retstatus = npy_get_floatstatus_barrier((char*)&out); if (retstatus) { int bufsize, errmask; PyObject *errobj; @@ -993,7 +993,7 @@ static PyObject * return Py_NotImplemented; } - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&out); /* * here we do the actual calculation with arg1 and arg2 @@ -1008,7 +1008,7 @@ static PyObject * } /* Check status flag. If it is set, then look up what to do */ - retstatus = PyUFunc_getfperr(); + retstatus = npy_get_floatstatus_barrier((char*)&out); if (retstatus) { int bufsize, errmask; PyObject *errobj; @@ -1072,7 +1072,7 @@ static PyObject * return Py_NotImplemented; } - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&out); /* * here we do the actual calculation with arg1 and arg2 @@ -1136,7 +1136,7 @@ static PyObject * return Py_NotImplemented; } - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&out); /* * here we do the actual calculation with arg1 and arg2 @@ -1150,7 +1150,7 @@ static PyObject * } /* Check status flag. 
If it is set, then look up what to do */ - retstatus = PyUFunc_getfperr(); + retstatus = npy_get_floatstatus_barrier((char*)&out); if (retstatus) { int bufsize, errmask; PyObject *errobj; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 2241414ac..5c0568c12 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -1031,7 +1031,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) i += 2 * stride; /* minps/minpd will set invalid flag if nan is encountered */ - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)&c1); LOOP_BLOCKED(@type@, 32) { @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]); @@ -1040,7 +1040,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) } c1 = @vpre@_@VOP@_@vsuf@(c1, c2); - if (npy_get_floatstatus() & NPY_FPE_INVALID) { + if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) { *op = @nan@; } else { @@ -1051,6 +1051,9 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) LOOP_BLOCKED_END { *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i]; } + if (npy_isnan(*op)) { + npy_set_floatstatus_invalid(); + } } /**end repeat1**/ diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index e0423630b..9b03a7916 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -65,6 +65,28 @@ #endif /**********************************************/ +typedef struct { + PyObject *in; /* The input arguments to the ufunc, a tuple */ + PyObject *out; /* The output arguments, a tuple. If no non-None outputs are + provided, then this is NULL. */ +} ufunc_full_args; + +/* Get the arg tuple to pass in the context argument to __array_wrap__ and + * __array_prepare__. + * + * Output arguments are only passed if at least one is non-None. 
+ */ +static PyObject * +_get_wrap_prepare_args(ufunc_full_args full_args) { + if (full_args.out == NULL) { + Py_INCREF(full_args.in); + return full_args.in; + } + else { + return PySequence_Concat(full_args.in, full_args.out); + } +} + /* ---------------------------------------------------------------- */ static int @@ -78,7 +100,8 @@ PyUFunc_getfperr(void) * non-clearing get was only added in 1.9 so this function always cleared * keep it so just in case third party code relied on the clearing */ - return npy_clear_floatstatus(); + char param = 0; + return npy_clear_floatstatus_barrier(¶m); } #define HANDLEIT(NAME, str) {if (retstatus & NPY_FPE_##NAME) { \ @@ -111,7 +134,8 @@ NPY_NO_EXPORT int PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) { /* clearing is done for backward compatibility */ - int retstatus = npy_clear_floatstatus(); + int retstatus; + retstatus = npy_clear_floatstatus_barrier((char*)&retstatus); return PyUFunc_handlefperr(errmask, errobj, retstatus, first); } @@ -122,7 +146,8 @@ PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) NPY_NO_EXPORT void PyUFunc_clearfperr() { - npy_clear_floatstatus(); + char param = 0; + npy_clear_floatstatus_barrier(¶m); } /* @@ -132,7 +157,7 @@ PyUFunc_clearfperr() * defines the method. */ static PyObject* -_find_array_method(PyObject *args, int nin, PyObject *method_name) +_find_array_method(PyObject *args, PyObject *method_name) { int i, n_methods; PyObject *obj; @@ -140,7 +165,7 @@ _find_array_method(PyObject *args, int nin, PyObject *method_name) PyObject *method = NULL; n_methods = 0; - for (i = 0; i < nin; i++) { + for (i = 0; i < PyTuple_GET_SIZE(args); i++) { obj = PyTuple_GET_ITEM(args, i); if (PyArray_CheckExact(obj) || PyArray_IsAnyScalar(obj)) { continue; @@ -238,17 +263,17 @@ _get_output_array_method(PyObject *obj, PyObject *method, * should just have PyArray_Return called. 
*/ static void -_find_array_prepare(PyObject *args, PyObject *kwds, +_find_array_prepare(ufunc_full_args args, PyObject **output_prep, int nin, int nout) { - Py_ssize_t nargs; int i; + PyObject *prep; /* * Determine the prepping function given by the input arrays * (could be NULL). */ - PyObject *prep = _find_array_method(args, nin, npy_um_str_array_prepare); + prep = _find_array_method(args.in, npy_um_str_array_prepare); /* * For all the output arrays decide what to do. * @@ -261,29 +286,16 @@ _find_array_prepare(PyObject *args, PyObject *kwds, * exact ndarray so that no PyArray_Return is * done in that case. */ - nargs = PyTuple_GET_SIZE(args); - for (i = 0; i < nout; i++) { - int j = nin + i; - PyObject *obj = NULL; - if (j < nargs) { - obj = PyTuple_GET_ITEM(args, j); - /* Output argument one may also be in a keyword argument */ - if (i == 0 && obj == Py_None && kwds != NULL) { - obj = PyDict_GetItem(kwds, npy_um_str_out); - } - } - /* Output argument one may also be in a keyword argument */ - else if (i == 0 && kwds != NULL) { - obj = PyDict_GetItem(kwds, npy_um_str_out); - } - - if (obj == NULL) { + if (args.out == NULL) { + for (i = 0; i < nout; i++) { Py_XINCREF(prep); output_prep[i] = prep; } - else { + } + else { + for (i = 0; i < nout; i++) { output_prep[i] = _get_output_array_method( - obj, npy_um_str_array_prepare, prep); + PyTuple_GET_ITEM(args.out, i), npy_um_str_array_prepare, prep); } } Py_XDECREF(prep); @@ -543,8 +555,9 @@ ufunc_get_name_cstr(PyUFuncObject *ufunc) { * Parses the positional and keyword arguments for a generic ufunc call. * * Note that if an error is returned, the caller must free the - * non-zero references in out_op. This - * function does not do its own clean-up. + * non-zero references in out_op. This function does not do its own clean-up. + * + * Note also that all the outputs from keyword arguments contain new references. 
*/ static int get_ufunc_arguments(PyUFuncObject *ufunc, @@ -553,10 +566,12 @@ get_ufunc_arguments(PyUFuncObject *ufunc, NPY_ORDER *out_order, NPY_CASTING *out_casting, PyObject **out_extobj, - PyObject **out_typetup, - int *out_subok, - PyArrayObject **out_wheremask, - PyObject **out_axes) + PyObject **out_typetup, /* type: Tuple[np.dtype] */ + int *out_subok, /* bool */ + PyArrayObject **out_wheremask, /* PyArray of bool */ + PyObject **out_axes, /* type: List[Tuple[T]] */ + PyObject **out_axis, /* type: T */ + int *out_keepdims) /* bool */ { int i, nargs; int nin = ufunc->nin; @@ -574,6 +589,9 @@ get_ufunc_arguments(PyUFuncObject *ufunc, if (out_axes != NULL) { *out_axes = NULL; } + if (out_axis != NULL) { + *out_axis = NULL; + } if (out_wheremask != NULL) { *out_wheremask = NULL; } @@ -811,11 +829,27 @@ get_ufunc_arguments(PyUFuncObject *ufunc, switch (str[0]) { case 'a': - /* possible axis argument for generalized ufunc */ + /* possible axes argument for generalized ufunc */ if (out_axes != NULL && strcmp(str, "axes") == 0) { + if (out_axis != NULL && *out_axis != NULL) { + PyErr_SetString(PyExc_TypeError, + "cannot specify both 'axis' and 'axes'"); + goto fail; + } + Py_INCREF(value); *out_axes = value; bad_arg = 0; } + else if (out_axis != NULL && strcmp(str, "axis") == 0) { + if (out_axes != NULL && *out_axes != NULL) { + PyErr_SetString(PyExc_TypeError, + "cannot specify both 'axis' and 'axes'"); + goto fail; + } + Py_INCREF(value); + *out_axis = value; + bad_arg = 0; + } break; case 'c': /* Provides a policy for allowed casting */ @@ -837,7 +871,7 @@ get_ufunc_arguments(PyUFuncObject *ufunc, if (dtype != NULL) { if (*out_typetup != NULL) { PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'sig' and 'dtype'"); + "cannot specify both 'signature' and 'dtype'"); goto fail; } *out_typetup = Py_BuildValue("(N)", dtype); @@ -851,10 +885,22 @@ get_ufunc_arguments(PyUFuncObject *ufunc, * error mask, and error object */ if (strcmp(str, "extobj") == 0) { + 
Py_INCREF(value); *out_extobj = value; bad_arg = 0; } break; + case 'k': + if (out_keepdims != NULL && strcmp(str, "keepdims") == 0) { + if (!PyBool_Check(value)) { + PyErr_SetString(PyExc_TypeError, + "'keepdims' must be a boolean"); + goto fail; + } + *out_keepdims = (value == Py_True); + bad_arg = 0; + } + break; case 'o': /* * Output arrays may be specified as a keyword argument, @@ -940,11 +986,11 @@ get_ufunc_arguments(PyUFuncObject *ufunc, } if (*out_typetup != NULL) { PyErr_SetString(PyExc_RuntimeError, - "cannot specify both 'sig' and 'dtype'"); + "cannot specify both 'signature' and 'dtype'"); goto fail; } - *out_typetup = value; Py_INCREF(value); + *out_typetup = value; bad_arg = 0; has_sig = 1; } @@ -1002,17 +1048,24 @@ get_ufunc_arguments(PyUFuncObject *ufunc, fail: Py_XDECREF(str_key_obj); - Py_XDECREF(*out_extobj); - *out_extobj = NULL; + /* + * XDECREF any output kwargs that were assigned, and set them to NULL. + */ Py_XDECREF(*out_typetup); *out_typetup = NULL; + Py_XDECREF(*out_extobj); + *out_extobj = NULL; + if (out_wheremask != NULL) { + Py_XDECREF(*out_wheremask); + *out_wheremask = NULL; + } if (out_axes != NULL) { Py_XDECREF(*out_axes); *out_axes = NULL; } - if (out_wheremask != NULL) { - Py_XDECREF(*out_wheremask); - *out_wheremask = NULL; + if (out_axis != NULL) { + Py_XDECREF(*out_axis); + *out_axis = NULL; } return -1; } @@ -1141,22 +1194,31 @@ static int prepare_ufunc_output(PyUFuncObject *ufunc, PyArrayObject **op, PyObject *arr_prep, - PyObject *arr_prep_args, + ufunc_full_args full_args, int i) { if (arr_prep != NULL && arr_prep != Py_None) { PyObject *res; PyArrayObject *arr; + PyObject *args_tup; - res = PyObject_CallFunction(arr_prep, "O(OOi)", - *op, ufunc, arr_prep_args, i); - if ((res == NULL) || (res == Py_None) || !PyArray_Check(res)) { - if (!PyErr_Occurred()){ - PyErr_SetString(PyExc_TypeError, - "__array_prepare__ must return an " - "ndarray or subclass thereof"); - } - Py_XDECREF(res); + /* Call with the context argument 
*/ + args_tup = _get_wrap_prepare_args(full_args); + if (args_tup == NULL) { + return -1; + } + res = PyObject_CallFunction( + arr_prep, "O(OOi)", *op, ufunc, args_tup, i); + Py_DECREF(args_tup); + + if (res == NULL) { + return -1; + } + else if (!PyArray_Check(res)) { + PyErr_SetString(PyExc_TypeError, + "__array_prepare__ must return an " + "ndarray or subclass thereof"); + Py_DECREF(res); return -1; } arr = (PyArrayObject *)res; @@ -1199,7 +1261,7 @@ iterator_loop(PyUFuncObject *ufunc, NPY_ORDER order, npy_intp buffersize, PyObject **arr_prep, - PyObject *arr_prep_args, + ufunc_full_args full_args, PyUFuncGenericFunction innerloop, void *innerloopdata) { @@ -1261,7 +1323,7 @@ iterator_loop(PyUFuncObject *ufunc, continue; } if (prepare_ufunc_output(ufunc, &op[nin+i], - arr_prep[i], arr_prep_args, i) < 0) { + arr_prep[i], full_args, i) < 0) { return -1; } } @@ -1289,7 +1351,7 @@ iterator_loop(PyUFuncObject *ufunc, /* Call the __array_prepare__ functions for the new array */ if (prepare_ufunc_output(ufunc, &op[nin+i], - arr_prep[i], arr_prep_args, i) < 0) { + arr_prep[i], full_args, i) < 0) { NpyIter_Close(iter); NpyIter_Deallocate(iter); return -1; @@ -1369,7 +1431,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, NPY_ORDER order, npy_intp buffersize, PyObject **arr_prep, - PyObject *arr_prep_args) + ufunc_full_args full_args) { npy_intp nin = ufunc->nin, nout = ufunc->nout; PyUFuncGenericFunction innerloop; @@ -1406,7 +1468,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, /* Call the __prepare_array__ if necessary */ if (prepare_ufunc_output(ufunc, &op[1], - arr_prep[0], arr_prep_args, 0) < 0) { + arr_prep[0], full_args, 0) < 0) { return -1; } @@ -1423,7 +1485,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, /* Call the __prepare_array__ if necessary */ if (prepare_ufunc_output(ufunc, &op[1], - arr_prep[0], arr_prep_args, 0) < 0) { + arr_prep[0], full_args, 0) < 0) { return -1; } @@ -1465,7 +1527,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, /* 
Call the __prepare_array__ if necessary */ if (prepare_ufunc_output(ufunc, &op[2], - arr_prep[0], arr_prep_args, 0) < 0) { + arr_prep[0], full_args, 0) < 0) { return -1; } @@ -1484,7 +1546,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, /* Call the __prepare_array__ if necessary */ if (prepare_ufunc_output(ufunc, &op[2], - arr_prep[0], arr_prep_args, 0) < 0) { + arr_prep[0], full_args, 0) < 0) { return -1; } @@ -1503,7 +1565,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, NPY_UF_DBG_PRINT("iterator loop\n"); if (iterator_loop(ufunc, op, dtypes, order, - buffersize, arr_prep, arr_prep_args, + buffersize, arr_prep, full_args, innerloop, innerloopdata) < 0) { return -1; } @@ -1530,7 +1592,7 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, NPY_ORDER order, npy_intp buffersize, PyObject **arr_prep, - PyObject *arr_prep_args) + ufunc_full_args full_args) { int retval, i, nin = ufunc->nin, nout = ufunc->nout; int nop = nin + nout; @@ -1643,7 +1705,7 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, Py_INCREF(op_tmp); if (prepare_ufunc_output(ufunc, &op_tmp, - arr_prep[i], arr_prep_args, i) < 0) { + arr_prep[i], full_args, i) < 0) { NpyIter_Close(iter); NpyIter_Deallocate(iter); return -1; @@ -1727,42 +1789,113 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, return retval; } -static PyObject * -make_arr_prep_args(npy_intp nin, PyObject *args, PyObject *kwds) +static npy_bool +tuple_all_none(PyObject *tup) { + npy_intp i; + for (i = 0; i < PyTuple_GET_SIZE(tup); ++i) { + if (PyTuple_GET_ITEM(tup, i) != Py_None) { + return NPY_FALSE; + } + } + return NPY_TRUE; +} + +/* + * Convert positional args and the out kwarg into an input and output tuple. + * + * If the output tuple would be all None, return NULL instead. 
+ * + * This duplicates logic in many places, so further refactoring is needed: + * - get_ufunc_arguments + * - PyUFunc_WithOverride + * - normalize___call___args + */ +static int +make_full_arg_tuple( + ufunc_full_args *full_args, + npy_intp nin, npy_intp nout, + PyObject *args, PyObject *kwds) { - PyObject *out = kwds ? PyDict_GetItem(kwds, npy_um_str_out) : NULL; - PyObject *arr_prep_args; + PyObject *out_kwd = NULL; + npy_intp nargs = PyTuple_GET_SIZE(args); + npy_intp i; - if (out == NULL) { - Py_INCREF(args); - return args; + /* This should have been checked by the caller */ + assert(nin <= nargs && nargs <= nin + nout); + + /* Initialize so we can XDECREF safely */ + full_args->in = NULL; + full_args->out = NULL; + + /* Get the input arguments*/ + full_args->in = PyTuple_GetSlice(args, 0, nin); + if (full_args->in == NULL) { + goto fail; } - else { - npy_intp i, nargs = PyTuple_GET_SIZE(args), n; - n = nargs; - if (n < nin + 1) { - n = nin + 1; - } - arr_prep_args = PyTuple_New(n); - if (arr_prep_args == NULL) { - return NULL; + + /* Look for output keyword arguments */ + out_kwd = kwds ? PyDict_GetItem(kwds, npy_um_str_out) : NULL; + + if (out_kwd != NULL) { + assert(nargs == nin); + if (out_kwd == Py_None) { + return 0; } - /* Copy the tuple, but set the nin-th item to the keyword arg */ - for (i = 0; i < nin; ++i) { - PyObject *item = PyTuple_GET_ITEM(args, i); - Py_INCREF(item); - PyTuple_SET_ITEM(arr_prep_args, i, item); + else if (PyTuple_Check(out_kwd)) { + assert(PyTuple_GET_SIZE(out_kwd) == nout); + if (tuple_all_none(out_kwd)) { + return 0; + } + Py_INCREF(out_kwd); + full_args->out = out_kwd; + return 0; } - Py_INCREF(out); - PyTuple_SET_ITEM(arr_prep_args, nin, out); - for (i = nin+1; i < n; ++i) { - PyObject *item = PyTuple_GET_ITEM(args, i); - Py_INCREF(item); - PyTuple_SET_ITEM(arr_prep_args, i, item); + else { + /* A single argument x is promoted to (x, None, None ...) 
*/ + full_args->out = PyTuple_New(nout); + if (full_args->out == NULL) { + goto fail; + } + Py_INCREF(out_kwd); + PyTuple_SET_ITEM(full_args->out, 0, out_kwd); + for (i = 1; i < nout; ++i) { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(full_args->out, i, Py_None); + } + return 0; } + } - return arr_prep_args; + /* No outputs in kwargs; if also none in args, we're done */ + if (nargs == nin) { + return 0; } + /* copy across positional output arguments, adding trailing Nones */ + full_args->out = PyTuple_New(nout); + if (full_args->out == NULL) { + goto fail; + } + for (i = nin; i < nargs; ++i) { + PyObject *item = PyTuple_GET_ITEM(args, i); + Py_INCREF(item); + PyTuple_SET_ITEM(full_args->out, i - nin, item); + } + for (i = nargs; i < nin + nout; ++i) { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(full_args->out, i - nin, Py_None); + } + + /* don't return a tuple full of None */ + if (tuple_all_none(full_args->out)) { + Py_DECREF(full_args->out); + full_args->out = NULL; + } + return 0; + +fail: + Py_XDECREF(full_args->in); + Py_XDECREF(full_args->out); + return -1; } /* @@ -1780,6 +1913,56 @@ _has_output_coredims(PyUFuncObject *ufunc) { } /* + * Check whether the gufunc can be used with axis, i.e., that there is only + * a single, shared core dimension (which means that operands either have + * that dimension, or have no core dimensions). Returns 0 if all is fine, + * and sets an error and returns -1 if not. + */ +static int +_check_axis_support(PyUFuncObject *ufunc) { + if (ufunc->core_num_dim_ix != 1) { + PyErr_Format(PyExc_TypeError, + "%s: axis can only be used with a single shared core " + "dimension, not with the %d distinct ones implied by " + "signature %s.", + ufunc_get_name_cstr(ufunc), + ufunc->core_num_dim_ix, + ufunc->core_signature); + return -1; + } + return 0; +} + +/* + * Check whether the gufunc can be used with keepdims, i.e., that all its + * input arguments have the same number of core dimension, and all output + * arguments have no core dimensions. 
Returns 0 if all is fine, and sets + * an error and returns -1 if not. + */ +static int +_check_keepdims_support(PyUFuncObject *ufunc) { + int i; + int nin = ufunc->nin, nout = ufunc->nout; + int input_core_dims = ufunc->core_num_dims[0]; + for (i = 1; i < nin + nout; i++) { + if (ufunc->core_num_dims[i] != (i < nin ? input_core_dims : 0)) { + PyErr_Format(PyExc_TypeError, + "%s does not support keepdims: its signature %s requires " + "%s %d to have %d core dimensions, but keepdims can only " + "be used when all inputs have the same number of core " + "dimensions and all outputs have no core dimensions.", + ufunc_get_name_cstr(ufunc), + ufunc->core_signature, + i < nin ? "input" : "output", + i < nin ? i : i - nin, + ufunc->core_num_dims[i]); + return -1; + } + } + return 0; +} + +/* * Interpret a possible axes keyword argument, using it to fill the remap_axis * array which maps default to actual axes for each operand, indexed as * as remap_axis[iop][iaxis]. The default axis order has first all broadcast @@ -1788,11 +1971,10 @@ _has_output_coredims(PyUFuncObject *ufunc) { * Returns 0 on success, and -1 on failure */ static int -_parse_axes_arg(PyUFuncObject *ufunc, PyObject *axes, PyArrayObject **op, - int broadcast_ndim, int **remap_axis) { +_parse_axes_arg(PyUFuncObject *ufunc, int core_num_dims[], PyObject *axes, + PyArrayObject **op, int broadcast_ndim, int **remap_axis) { int nin = ufunc->nin; - int nout = ufunc->nout; - int nop = nin + nout; + int nop = ufunc->nargs; int iop, list_size; if (!PyList_Check(axes)) { @@ -1819,7 +2001,7 @@ _parse_axes_arg(PyUFuncObject *ufunc, PyObject *axes, PyArrayObject **op, PyObject *op_axes_tuple, *axis_item; int axis, op_axis; - op_ncore = ufunc->core_num_dims[iop]; + op_ncore = core_num_dims[iop]; if (op[iop] != NULL) { op_ndim = PyArray_NDIM(op[iop]); op_nbroadcast = op_ndim - op_ncore; @@ -1910,6 +2092,59 @@ _parse_axes_arg(PyUFuncObject *ufunc, PyObject *axes, PyArrayObject **op, return 0; } +/* + * Simplified version 
of the above, using axis to fill the remap_axis + * array, which maps default to actual axes for each operand, indexed as + * as remap_axis[iop][iaxis]. The default axis order has first all broadcast + * axes and then the core axes the gufunc operates on. + * + * Returns 0 on success, and -1 on failure + */ +static int +_parse_axis_arg(PyUFuncObject *ufunc, int core_num_dims[], PyObject *axis, + PyArrayObject **op, int broadcast_ndim, int **remap_axis) { + int nop = ufunc->nargs; + int iop, axis_int; + + axis_int = PyArray_PyIntAsInt(axis); + if (error_converting(axis_int)) { + return -1; + } + + for (iop = 0; iop < nop; ++iop) { + int axis, op_ndim, op_axis; + + /* _check_axis_support ensures core_num_dims is 0 or 1 */ + if (core_num_dims[iop] == 0) { + remap_axis[iop] = NULL; + continue; + } + if (op[iop]) { + op_ndim = PyArray_NDIM(op[iop]); + } + else { + op_ndim = broadcast_ndim + 1; + } + op_axis = axis_int; /* ensure we don't modify axis_int */ + if (check_and_adjust_axis(&op_axis, op_ndim) < 0) { + return -1; + } + /* Are we actually remapping away from last axis? */ + if (op_axis == op_ndim - 1) { + remap_axis[iop] = NULL; + continue; + } + remap_axis[iop][op_ndim - 1] = op_axis; + for (axis = 0; axis < op_axis; axis++) { + remap_axis[iop][axis] = axis; + } + for (axis = op_axis; axis < op_ndim - 1; axis++) { + remap_axis[iop][axis] = axis + 1; + } + } /* end of for(iop) loop over operands */ + return 0; +} + #define REMAP_AXIS(iop, axis) ((remap_axis != NULL && \ remap_axis[iop] != NULL)? 
\ remap_axis[iop][axis] : axis) @@ -2069,6 +2304,8 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, /* Use remapped axes for generalized ufunc */ int broadcast_ndim, iter_ndim; + int core_num_dims_array[NPY_MAXARGS]; + int *core_num_dims; int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS]; int *op_axes[NPY_MAXARGS]; @@ -2097,17 +2334,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, int **remap_axis = NULL; /* The __array_prepare__ function to call for each output */ PyObject *arr_prep[NPY_MAXARGS]; - /* - * This is either args, or args with the out= parameter from - * kwds added appropriately. - */ - PyObject *arr_prep_args = NULL; + /* The separated input and output arguments, parsed from args and kwds */ + ufunc_full_args full_args = {NULL, NULL}; NPY_ORDER order = NPY_KEEPORDER; /* Use the default assignment casting rule */ NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING; - /* When provided, extobj, typetup, and axes contain borrowed references */ - PyObject *extobj = NULL, *type_tup = NULL, *axes = NULL; + /* other possible keyword arguments */ + PyObject *extobj = NULL, *type_tup = NULL, *axes = NULL, *axis = NULL; + int keepdims = -1; if (ufunc == NULL) { PyErr_SetString(PyExc_ValueError, "function not supported"); @@ -2131,28 +2366,64 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, NPY_UF_DBG_PRINT("Getting arguments\n"); - /* Get all the arguments */ + /* + * Get all the arguments. + */ retval = get_ufunc_arguments(ufunc, args, kwds, op, &order, &casting, &extobj, - &type_tup, &subok, NULL, &axes); + &type_tup, &subok, NULL, &axes, &axis, &keepdims); if (retval < 0) { goto fail; } - + /* + * If keepdims was passed in (and thus changed from the initial value + * on top), check the gufunc is suitable, i.e., that its inputs share + * the same number of core dimensions, and its outputs have none. 
+ */ + if (keepdims != -1) { + retval = _check_keepdims_support(ufunc); + if (retval < 0) { + goto fail; + } + } + if (axis != NULL) { + retval = _check_axis_support(ufunc); + if (retval < 0) { + goto fail; + } + } + /* + * If keepdims is set and true, signal all dimensions will be the same. + */ + if (keepdims == 1) { + int num_dims = ufunc->core_num_dims[0]; + for (i = 0; i < nop; ++i) { + core_num_dims_array[i] = num_dims; + } + core_num_dims = core_num_dims_array; + } + else { + /* keepdims was not set or was false; no adjustment necessary */ + core_num_dims = ufunc->core_num_dims; + keepdims = 0; + } /* * Check that operands have the minimum dimensions required. * (Just checks core; broadcast dimensions are tested by the iterator.) */ for (i = 0; i < nop; i++) { - if (op[i] != NULL && PyArray_NDIM(op[i]) < ufunc->core_num_dims[i]) { + if (op[i] != NULL && PyArray_NDIM(op[i]) < core_num_dims[i]) { PyErr_Format(PyExc_ValueError, "%s: %s operand %d does not have enough " "dimensions (has %d, gufunc core with " "signature %s requires %d)", - ufunc_get_name_cstr(ufunc), + ufunc_name, i < nin ? "Input" : "Output", - i < nin ? i : i - nin, PyArray_NDIM(op[i]), - ufunc->core_signature, ufunc->core_num_dims[i]); + i < nin ? i : i - nin, + PyArray_NDIM(op[i]), + ufunc->core_signature, + core_num_dims[i]); + retval = -1; goto fail; } } @@ -2164,7 +2435,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, */ broadcast_ndim = 0; for (i = 0; i < nin; ++i) { - int n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i]; + int n = PyArray_NDIM(op[i]) - core_num_dims[i]; if (n > broadcast_ndim) { broadcast_ndim = n; } @@ -2178,7 +2449,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, */ iter_ndim = broadcast_ndim; for (i = nin; i < nop; ++i) { - iter_ndim += ufunc->core_num_dims[i]; + iter_ndim += core_num_dims[i]; } if (iter_ndim > NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, @@ -2189,7 +2460,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, } /* Possibly remap axes. 
*/ - if (axes) { + if (axes != NULL || axis != NULL) { remap_axis = PyArray_malloc(sizeof(remap_axis[0]) * nop); remap_axis_memory = PyArray_malloc(sizeof(remap_axis_memory[0]) * nop * NPY_MAXDIMS); @@ -2200,8 +2471,14 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, for (i=0; i < nop; i++) { remap_axis[i] = remap_axis_memory + i * NPY_MAXDIMS; } - retval = _parse_axes_arg(ufunc, axes, op, broadcast_ndim, - remap_axis); + if (axis) { + retval = _parse_axis_arg(ufunc, core_num_dims, axis, op, + broadcast_ndim, remap_axis); + } + else { + retval = _parse_axes_arg(ufunc, core_num_dims, axes, op, + broadcast_ndim, remap_axis); + } if(retval < 0) { goto fail; } @@ -2222,12 +2499,13 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, j = broadcast_ndim; for (i = 0; i < nop; ++i) { int n; + if (op[i]) { /* * Note that n may be negative if broadcasting * extends into the core dimensions. */ - n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i]; + n = PyArray_NDIM(op[i]) - core_num_dims[i]; } else { n = broadcast_ndim; @@ -2251,10 +2529,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, /* Except for when it belongs to this output */ if (i >= nin) { int dim_offset = ufunc->core_offsets[i]; - int num_dims = ufunc->core_num_dims[i]; - /* Fill in 'iter_shape' and 'op_axes' for this output */ + int num_dims = core_num_dims[i]; + /* + * Fill in 'iter_shape' and 'op_axes' for the core dimensions + * of this output. Here, we have to be careful: if keepdims + * was used, then this axis is not a real core dimension, + * but is being added back for broadcasting, so its size is 1. + */ for (idim = 0; idim < num_dims; ++idim) { - iter_shape[j] = core_dim_sizes[ + iter_shape[j] = keepdims ? 
1 : core_dim_sizes[ ufunc->core_dim_ixs[dim_offset + idim]]; op_axes_arrays[i][j] = REMAP_AXIS(i, n + idim); ++j; @@ -2300,19 +2583,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, #endif if (subok) { + if (make_full_arg_tuple(&full_args, nin, nout, args, kwds) < 0) { + goto fail; + } + /* * Get the appropriate __array_prepare__ function to call * for each output */ - _find_array_prepare(args, kwds, arr_prep, nin, nout); - - /* Set up arr_prep_args if a prep function was needed */ - for (i = 0; i < nout; ++i) { - if (arr_prep[i] != NULL && arr_prep[i] != Py_None) { - arr_prep_args = make_arr_prep_args(nin, args, kwds); - break; - } - } + _find_array_prepare(full_args, arr_prep, nin, nout); } /* If the loop wants the arrays, provide them */ @@ -2459,7 +2738,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, #endif /* Start with the floating-point exception flags cleared */ - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&iter); NPY_UF_DBG_PRINT("Executing inner loop\n"); @@ -2543,7 +2822,11 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, Py_XDECREF(arr_prep[i]); } Py_XDECREF(type_tup); - Py_XDECREF(arr_prep_args); + Py_XDECREF(extobj); + Py_XDECREF(axes); + Py_XDECREF(axis); + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); NPY_UF_DBG_PRINT("Returning Success\n"); @@ -2561,7 +2844,11 @@ fail: Py_XDECREF(arr_prep[i]); } Py_XDECREF(type_tup); - Py_XDECREF(arr_prep_args); + Py_XDECREF(extobj); + Py_XDECREF(axes); + Py_XDECREF(axis); + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); PyArray_free(remap_axis_memory); PyArray_free(remap_axis); return retval; @@ -2599,14 +2886,13 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, * This is either args, or args with the out= parameter from * kwds added appropriately. 
*/ - PyObject *arr_prep_args = NULL; + ufunc_full_args full_args = {NULL, NULL}; int trivial_loop_ok = 0; NPY_ORDER order = NPY_KEEPORDER; /* Use the default assignment casting rule */ NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING; - /* When provided, extobj and typetup contain borrowed references */ PyObject *extobj = NULL, *type_tup = NULL; if (ufunc == NULL) { @@ -2638,7 +2924,7 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, /* Get all the arguments */ retval = get_ufunc_arguments(ufunc, args, kwds, op, &order, &casting, &extobj, - &type_tup, &subok, &wheremask, NULL); + &type_tup, &subok, &wheremask, NULL, NULL, NULL); if (retval < 0) { goto fail; } @@ -2691,23 +2977,18 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, #endif if (subok) { + if (make_full_arg_tuple(&full_args, nin, nout, args, kwds) < 0) { + goto fail; + } /* * Get the appropriate __array_prepare__ function to call * for each output */ - _find_array_prepare(args, kwds, arr_prep, nin, nout); - - /* Set up arr_prep_args if a prep function was needed */ - for (i = 0; i < nout; ++i) { - if (arr_prep[i] != NULL && arr_prep[i] != Py_None) { - arr_prep_args = make_arr_prep_args(nin, args, kwds); - break; - } - } + _find_array_prepare(full_args, arr_prep, nin, nout); } /* Start with the floating-point exception flags cleared */ - PyUFunc_clearfperr(); + npy_clear_floatstatus_barrier((char*)&ufunc); /* Do the ufunc loop */ if (need_fancy) { @@ -2715,14 +2996,14 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, retval = execute_fancy_ufunc_loop(ufunc, wheremask, op, dtypes, order, - buffersize, arr_prep, arr_prep_args); + buffersize, arr_prep, full_args); } else { NPY_UF_DBG_PRINT("Executing legacy inner loop\n"); retval = execute_legacy_ufunc_loop(ufunc, trivial_loop_ok, op, dtypes, order, - buffersize, arr_prep, arr_prep_args); + buffersize, arr_prep, full_args); } if (retval < 0) { goto fail; @@ -2742,7 +3023,9 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, Py_XDECREF(arr_prep[i]); } 
Py_XDECREF(type_tup); - Py_XDECREF(arr_prep_args); + Py_XDECREF(extobj); + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); Py_XDECREF(wheremask); NPY_UF_DBG_PRINT("Returning Success\n"); @@ -2758,7 +3041,9 @@ fail: Py_XDECREF(arr_prep[i]); } Py_XDECREF(type_tup); - Py_XDECREF(arr_prep_args); + Py_XDECREF(extobj); + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); Py_XDECREF(wheremask); return retval; @@ -3019,20 +3304,25 @@ finish_loop: */ static PyArrayObject * PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, - int naxes, int *axes, PyArray_Descr *odtype, int keepdims) + int naxes, int *axes, PyArray_Descr *odtype, int keepdims, + PyObject *initial) { int iaxes, ndim; npy_bool reorderable; npy_bool axis_flags[NPY_MAXDIMS]; PyArray_Descr *dtype; PyArrayObject *result; - PyObject *identity = NULL; + PyObject *identity; const char *ufunc_name = ufunc_get_name_cstr(ufunc); /* These parameters come from a TLS global */ int buffersize = 0, errormask = 0; + static PyObject *NoValue = NULL; NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s.reduce\n", ufunc_name); + npy_cache_import("numpy", "_NoValue", &NoValue); + if (NoValue == NULL) return NULL; + ndim = PyArray_NDIM(arr); /* Create an array of flags for reduction */ @@ -3056,19 +3346,28 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, if (identity == NULL) { return NULL; } - /* - * The identity for a dynamic dtype like - * object arrays can't be used in general - */ - if (identity != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) { + + /* Get the initial value */ + if (initial == NULL || initial == NoValue) { + initial = identity; + + /* + * The identity for a dynamic dtype like + * object arrays can't be used in general + */ + if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) { + Py_DECREF(initial); + initial = Py_None; + Py_INCREF(initial); + } + } else { Py_DECREF(identity); - identity = Py_None; - 
Py_INCREF(identity); + Py_INCREF(initial); /* match the reference count in the if above */ } /* Get the reduction dtype */ if (reduce_type_resolver(ufunc, arr, odtype, &dtype) < 0) { - Py_DECREF(identity); + Py_DECREF(initial); return NULL; } @@ -3076,12 +3375,12 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, NPY_UNSAFE_CASTING, axis_flags, reorderable, keepdims, 0, - identity, + initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask); Py_DECREF(dtype); - Py_DECREF(identity); + Py_DECREF(initial); return result; } @@ -3472,7 +3771,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, op_axes_arrays[2]}; npy_uint32 op_flags[3]; int i, idim, ndim, otype_final; - int need_outer_iterator; + int need_outer_iterator = 0; NpyIter *iter = NULL; @@ -3845,8 +4144,9 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args, PyArray_Descr *otype = NULL; PyArrayObject *out = NULL; int keepdims = 0; + PyObject *initial = NULL; static char *reduce_kwlist[] = { - "array", "axis", "dtype", "out", "keepdims", NULL}; + "array", "axis", "dtype", "out", "keepdims", "initial", NULL}; static char *accumulate_kwlist[] = { "array", "axis", "dtype", "out", NULL}; static char *reduceat_kwlist[] = { @@ -3918,13 +4218,13 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args, } } else { - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&i:reduce", + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&iO:reduce", reduce_kwlist, &op, &axes_in, PyArray_DescrConverter2, &otype, PyArray_OutputConverter, &out, - &keepdims)) { + &keepdims, &initial)) { goto fail; } } @@ -4055,7 +4355,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args, switch(operation) { case UFUNC_REDUCE: ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes, - otype, keepdims); + otype, keepdims, initial); break; case UFUNC_ACCUMULATE: if (naxes != 1) { @@ -4127,11 +4427,10 @@ fail: * should just have PyArray_Return called. 
*/ static void -_find_array_wrap(PyObject *args, PyObject *kwds, +_find_array_wrap(ufunc_full_args args, PyObject *kwds, PyObject **output_wrap, int nin, int nout) { - Py_ssize_t nargs; - int i, idx_offset, start_idx; + int i; PyObject *obj; PyObject *wrap = NULL; @@ -4151,7 +4450,7 @@ _find_array_wrap(PyObject *args, PyObject *kwds, * Determine the wrapping function given by the input arrays * (could be NULL). */ - wrap = _find_array_method(args, nin, npy_um_str_array_wrap); + wrap = _find_array_method(args.in, npy_um_str_array_wrap); /* * For all the output arrays decide what to do. @@ -4166,44 +4465,16 @@ _find_array_wrap(PyObject *args, PyObject *kwds, * done in that case. */ handle_out: - nargs = PyTuple_GET_SIZE(args); - /* Default is using positional arguments */ - obj = args; - idx_offset = nin; - start_idx = 0; - if (nin == nargs && kwds != NULL) { - /* There may be a keyword argument we can use instead */ - obj = PyDict_GetItem(kwds, npy_um_str_out); - if (obj == NULL) { - /* No, go back to positional (even though there aren't any) */ - obj = args; - } - else { - idx_offset = 0; - if (PyTuple_Check(obj)) { - /* If a tuple, must have all nout items */ - nargs = nout; - } - else { - /* If the kwarg is not a tuple then it is an array (or None) */ - output_wrap[0] = _get_output_array_method( - obj, npy_um_str_array_wrap, wrap); - start_idx = 1; - nargs = 1; - } + if (args.out == NULL) { + for (i = 0; i < nout; i++) { + Py_XINCREF(wrap); + output_wrap[i] = wrap; } } - - for (i = start_idx; i < nout; ++i) { - int j = idx_offset + i; - - if (j < nargs) { + else { + for (i = 0; i < nout; i++) { output_wrap[i] = _get_output_array_method( - PyTuple_GET_ITEM(obj, j), npy_um_str_array_wrap, wrap); - } - else { - output_wrap[i] = wrap; - Py_XINCREF(wrap); + PyTuple_GET_ITEM(args.out, i), npy_um_str_array_wrap, wrap); } } @@ -4216,12 +4487,11 @@ static PyObject * ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) { int i; - PyTupleObject *ret; 
PyArrayObject *mps[NPY_MAXARGS]; PyObject *retobj[NPY_MAXARGS]; PyObject *wraparr[NPY_MAXARGS]; - PyObject *res; PyObject *override = NULL; + ufunc_full_args full_args = {NULL, NULL}; int errval; errval = PyUFunc_CheckOverride(ufunc, "__call__", args, kwds, &override); @@ -4286,20 +4556,37 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) * None --- array-object passed in don't call PyArray_Return * method --- the __array_wrap__ method to call. */ - _find_array_wrap(args, kwds, wraparr, ufunc->nin, ufunc->nout); + if (make_full_arg_tuple(&full_args, ufunc->nin, ufunc->nout, args, kwds) < 0) { + goto fail; + } + _find_array_wrap(full_args, kwds, wraparr, ufunc->nin, ufunc->nout); /* wrap outputs */ for (i = 0; i < ufunc->nout; i++) { int j = ufunc->nin+i; PyObject *wrap = wraparr[i]; - if (wrap != NULL) { - if (wrap == Py_None) { - Py_DECREF(wrap); - retobj[i] = (PyObject *)mps[j]; - continue; + if (wrap == NULL) { + /* default behavior */ + retobj[i] = PyArray_Return(mps[j]); + } + else if (wrap == Py_None) { + Py_DECREF(wrap); + retobj[i] = (PyObject *)mps[j]; + } + else { + PyObject *res; + PyObject *args_tup; + + /* Call the method with appropriate context */ + args_tup = _get_wrap_prepare_args(full_args); + if (args_tup == NULL) { + goto fail; } - res = PyObject_CallFunction(wrap, "O(OOi)", mps[j], ufunc, args, i); + res = PyObject_CallFunction( + wrap, "O(OOi)", mps[j], ufunc, args_tup, i); + Py_DECREF(args_tup); + /* Handle __array_wrap__ that does not accept a context argument */ if (res == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) { PyErr_Clear(); @@ -4309,23 +4596,21 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) if (res == NULL) { goto fail; } - else { - Py_DECREF(mps[j]); - retobj[i] = res; - continue; - } - } - else { - /* default behavior */ - retobj[i] = PyArray_Return(mps[j]); - } + Py_DECREF(mps[j]); + retobj[i] = res; + } } + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); + if 
(ufunc->nout == 1) { return retobj[0]; } else { + PyTupleObject *ret; + ret = (PyTupleObject *)PyTuple_New(ufunc->nout); for (i = 0; i < ufunc->nout; i++) { PyTuple_SET_ITEM(ret, i, retobj[i]); @@ -4334,6 +4619,8 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds) } fail: + Py_XDECREF(full_args.in); + Py_XDECREF(full_args.out); for (i = ufunc->nin; i < ufunc->nargs; i++) { Py_XDECREF(mps[i]); } @@ -4439,7 +4726,7 @@ PyUFunc_FromFuncAndData(PyUFuncGenericFunction *func, void **data, const char *name, const char *doc, int unused) { return PyUFunc_FromFuncAndDataAndSignature(func, data, types, ntypes, - nin, nout, identity, name, doc, 0, NULL); + nin, nout, identity, name, doc, unused, NULL); } /*UFUNC_API*/ diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 15da831b2..5567b9bbf 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -87,11 +87,12 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS /* Keywords are ignored for now */ PyObject *function, *pyname = NULL; - int nin, nout, i; + int nin, nout, i, nargs; PyUFunc_PyFuncData *fdata; PyUFuncObject *self; - char *fname, *str; + char *fname, *str, *types, *doc; Py_ssize_t fname_len = -1; + void * ptr, **data; int offset[2]; if (!PyArg_ParseTuple(args, "Oii:frompyfunc", &function, &nin, &nout)) { @@ -101,43 +102,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS PyErr_SetString(PyExc_TypeError, "function must be callable"); return NULL; } - if (nin + nout > NPY_MAXARGS) { - PyErr_Format(PyExc_ValueError, - "Cannot construct a ufunc with more than %d operands " - "(requested number were: inputs = %d and outputs = %d)", - NPY_MAXARGS, nin, nout); - return NULL; - } - self = PyArray_malloc(sizeof(PyUFuncObject)); - if (self == NULL) { - return NULL; - } - PyObject_Init((PyObject *)self, &PyUFunc_Type); - - self->userloops = NULL; - self->nin = nin; - 
self->nout = nout; - self->nargs = nin + nout; - self->identity = PyUFunc_None; - self->functions = pyfunc_functions; - self->ntypes = 1; - - /* generalized ufunc */ - self->core_enabled = 0; - self->core_num_dim_ix = 0; - self->core_num_dims = NULL; - self->core_dim_ixs = NULL; - self->core_offsets = NULL; - self->core_signature = NULL; - self->op_flags = PyArray_malloc(sizeof(npy_uint32)*self->nargs); - if (self->op_flags == NULL) { - return PyErr_NoMemory(); - } - memset(self->op_flags, 0, sizeof(npy_uint32)*self->nargs); - self->iter_flags = 0; - - self->type_resolver = &object_ufunc_type_resolver; - self->legacy_inner_loop_selector = &object_ufunc_loop_selector; + nargs = nin + nout; pyname = PyObject_GetAttrString(function, "__name__"); if (pyname) { @@ -150,7 +115,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS } /* - * self->ptr holds a pointer for enough memory for + * ptr will be assigned to self->ptr, holds a pointer for enough memory for * self->data[0] (fdata) * self->data * self->name @@ -164,39 +129,51 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS if (i) { offset[0] += (sizeof(void *) - i); } - offset[1] = self->nargs; - i = (self->nargs % sizeof(void *)); + offset[1] = nargs; + i = (nargs % sizeof(void *)); if (i) { offset[1] += (sizeof(void *)-i); } - self->ptr = PyArray_malloc(offset[0] + offset[1] + sizeof(void *) + + ptr = PyArray_malloc(offset[0] + offset[1] + sizeof(void *) + (fname_len + 14)); - if (self->ptr == NULL) { + if (ptr == NULL) { Py_XDECREF(pyname); return PyErr_NoMemory(); } - Py_INCREF(function); - self->obj = function; - fdata = (PyUFunc_PyFuncData *)(self->ptr); + fdata = (PyUFunc_PyFuncData *)(ptr); + fdata->callable = function; fdata->nin = nin; fdata->nout = nout; - fdata->callable = function; - self->data = (void **)(((char *)self->ptr) + offset[0]); - self->data[0] = (void *)fdata; - self->types = (char *)self->data + sizeof(void *); - for (i = 0; 
i < self->nargs; i++) { - self->types[i] = NPY_OBJECT; + data = (void **)(((char *)ptr) + offset[0]); + data[0] = (void *)fdata; + types = (char *)data + sizeof(void *); + for (i = 0; i < nargs; i++) { + types[i] = NPY_OBJECT; } - str = self->types + offset[1]; + str = types + offset[1]; memcpy(str, fname, fname_len); memcpy(str+fname_len, " (vectorized)", 14); - self->name = str; - Py_XDECREF(pyname); /* Do a better job someday */ - self->doc = "dynamic ufunc based on a python function"; + doc = "dynamic ufunc based on a python function"; + + self = (PyUFuncObject *)PyUFunc_FromFuncAndData( + (PyUFuncGenericFunction *)pyfunc_functions, data, + types, /* ntypes */ 1, nin, nout, PyUFunc_None, + str, doc, /* unused */ 0); + + if (self == NULL) { + PyArray_free(ptr); + return NULL; + } + Py_INCREF(function); + self->obj = function; + self->ptr = ptr; + + self->type_resolver = &object_ufunc_type_resolver; + self->legacy_inner_loop_selector = &object_ufunc_loop_selector; return (PyObject *)self; } diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py index a927968a4..9755e7b36 100644 --- a/numpy/core/tests/test_api.py +++ b/numpy/core/tests/test_api.py @@ -223,22 +223,25 @@ def test_array_astype(): b = a.astype('f4', subok=0, copy=False) assert_(a is b) - a = np.matrix([[0, 1, 2], [3, 4, 5]], dtype='f4') + class MyNDArray(np.ndarray): + pass - # subok=True passes through a matrix + a = np.array([[0, 1, 2], [3, 4, 5]], dtype='f4').view(MyNDArray) + + # subok=True passes through a subclass b = a.astype('f4', subok=True, copy=False) assert_(a is b) # subok=True is default, and creates a subtype on a cast b = a.astype('i4', copy=False) assert_equal(a, b) - assert_equal(type(b), np.matrix) + assert_equal(type(b), MyNDArray) - # subok=False never returns a matrix + # subok=False never returns a subclass b = a.astype('f4', subok=False, copy=False) assert_equal(a, b) assert_(not (a is b)) - assert_(type(b) is not np.matrix) + assert_(type(b) is not MyNDArray) 
# Make sure converting from string object to fixed length string # does not truncate. diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py index dca2d2541..e433877e8 100644 --- a/numpy/core/tests/test_datetime.py +++ b/numpy/core/tests/test_datetime.py @@ -124,7 +124,7 @@ class TestDateTime(object): assert_(not np.can_cast('M8[h]', 'M8', casting='safe')) def test_compare_generic_nat(self): - # regression tests for GH6452 + # regression tests for gh-6452 assert_equal(np.datetime64('NaT'), np.datetime64('2000') + np.timedelta64('NaT')) # nb. we may want to make NaT != NaT true in the future @@ -236,18 +236,25 @@ class TestDateTime(object): # find "supertype" for non-dates and dates b = np.bool_(True) - dt = np.datetime64('1970-01-01', 'M') - arr = np.array([b, dt]) + dm = np.datetime64('1970-01-01', 'M') + d = datetime.date(1970, 1, 1) + dt = datetime.datetime(1970, 1, 1, 12, 30, 40) + + arr = np.array([b, dm]) assert_equal(arr.dtype, np.dtype('O')) - dt = datetime.date(1970, 1, 1) - arr = np.array([b, dt]) + arr = np.array([b, d]) assert_equal(arr.dtype, np.dtype('O')) - dt = datetime.datetime(1970, 1, 1, 12, 30, 40) arr = np.array([b, dt]) assert_equal(arr.dtype, np.dtype('O')) + arr = np.array([d, d]).astype('datetime64') + assert_equal(arr.dtype, np.dtype('M8[D]')) + + arr = np.array([dt, dt]).astype('datetime64') + assert_equal(arr.dtype, np.dtype('M8[us]')) + def test_timedelta_scalar_construction(self): # Construct with different units assert_equal(np.timedelta64(7, 'D'), @@ -324,6 +331,24 @@ class TestDateTime(object): a = np.timedelta64(1, 'Y') assert_raises(TypeError, np.timedelta64, a, 'D') assert_raises(TypeError, np.timedelta64, a, 'm') + a = datetime.timedelta(seconds=3) + assert_raises(TypeError, np.timedelta64, a, 'M') + assert_raises(TypeError, np.timedelta64, a, 'Y') + a = datetime.timedelta(weeks=3) + assert_raises(TypeError, np.timedelta64, a, 'M') + assert_raises(TypeError, np.timedelta64, a, 'Y') + a = 
datetime.timedelta() + assert_raises(TypeError, np.timedelta64, a, 'M') + assert_raises(TypeError, np.timedelta64, a, 'Y') + + def test_timedelta_object_array_conversion(self): + # Regression test for gh-11096 + inputs = [datetime.timedelta(28), + datetime.timedelta(30), + datetime.timedelta(31)] + expected = np.array([28, 30, 31], dtype='timedelta64[D]') + actual = np.array(inputs, dtype='timedelta64[D]') + assert_equal(expected, actual) def test_timedelta_scalar_construction_units(self): # String construction detecting units diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py index 5d59d8226..60a7c72f7 100644 --- a/numpy/core/tests/test_deprecations.py +++ b/numpy/core/tests/test_deprecations.py @@ -134,6 +134,22 @@ class _VisibleDeprecationTestCase(_DeprecationTestCase): warning_cls = np.VisibleDeprecationWarning +class TestNonTupleNDIndexDeprecation(object): + def test_basic(self): + a = np.zeros((5, 5)) + with warnings.catch_warnings(): + warnings.filterwarnings('always') + assert_warns(FutureWarning, a.__getitem__, [[0, 1], [0, 1]]) + assert_warns(FutureWarning, a.__getitem__, [slice(None)]) + + warnings.filterwarnings('error') + assert_raises(FutureWarning, a.__getitem__, [[0, 1], [0, 1]]) + assert_raises(FutureWarning, a.__getitem__, [slice(None)]) + + # a a[[0, 1]] always was advanced indexing, so no error/warning + a[[0, 1]] + + class TestRankDeprecation(_DeprecationTestCase): """Test that np.rank is deprecated. The function should simply be removed. The VisibleDeprecationWarning may become unnecessary. diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py index 792b9e0a2..63e75ff7a 100644 --- a/numpy/core/tests/test_einsum.py +++ b/numpy/core/tests/test_einsum.py @@ -491,8 +491,16 @@ class TestEinSum(object): assert_array_equal(np.einsum('ij,ij->j', p, q, optimize=True), [10.] 
* 2) - p = np.ones((1, 5)) - q = np.ones((5, 5)) + # a blas-compatible contraction broadcasting case which was failing + # for optimize=True (ticket #10930) + x = np.array([2., 3.]) + y = np.array([4.]) + assert_array_equal(np.einsum("i, i", x, y, optimize=False), 20.) + assert_array_equal(np.einsum("i, i", x, y, optimize=True), 20.) + + # all-ones array was bypassing bug (ticket #10930) + p = np.ones((1, 5)) / 2 + q = np.ones((5, 5)) / 2 for optimize in (True, False): assert_array_equal(np.einsum("...ij,...jk->...ik", p, p, optimize=optimize), @@ -500,7 +508,17 @@ class TestEinSum(object): optimize=optimize)) assert_array_equal(np.einsum("...ij,...jk->...ik", p, q, optimize=optimize), - np.full((1, 5), 5)) + np.full((1, 5), 1.25)) + + # Cases which were failing (gh-10899) + x = np.eye(2, dtype=dtype) + y = np.ones(2, dtype=dtype) + assert_array_equal(np.einsum("ji,i->", x, y, optimize=optimize), + [2.]) # contig_contig_outstride0_two + assert_array_equal(np.einsum("i,ij->", y, x, optimize=optimize), + [2.]) # stride0_contig_outstride0_two + assert_array_equal(np.einsum("ij,i->", x, y, optimize=optimize), + [2.]) # contig_stride0_outstride0_two def test_einsum_sums_int8(self): self.check_einsum_sums('i1') @@ -586,6 +604,17 @@ class TestEinSum(object): [[[1, 3], [3, 9], [5, 15], [7, 21]], [[8, 16], [16, 32], [24, 48], [32, 64]]]) + def test_subscript_range(self): + # Issue #7741, make sure that all letters of Latin alphabet (both uppercase & lowercase) can be used + # when creating a subscript from arrays + a = np.ones((2, 3)) + b = np.ones((3, 4)) + np.einsum(a, [0, 20], b, [20, 2], [0, 2], optimize=False) + np.einsum(a, [0, 27], b, [27, 2], [0, 2], optimize=False) + np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False) + assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False)) + assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False)) + def test_einsum_broadcast(self): # Issue #2455 change 
in handling ellipsis # remove the 'middle broadcast' error diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py index 65852e577..88f5deabc 100644 --- a/numpy/core/tests/test_indexing.py +++ b/numpy/core/tests/test_indexing.py @@ -576,19 +576,6 @@ class TestSubclasses(object): assert_(isinstance(s[[0, 1, 2]], SubClass)) assert_(isinstance(s[s > 0], SubClass)) - def test_matrix_fancy(self): - # The matrix class messes with the shape. While this is always - # weird (getitem is not used, it does not have setitem nor knows - # about fancy indexing), this tests gh-3110 - m = np.matrix([[1, 2], [3, 4]]) - - assert_(isinstance(m[[0,1,0], :], np.matrix)) - - # gh-3110. Note the transpose currently because matrices do *not* - # support dimension fixing for fancy indexing correctly. - x = np.asmatrix(np.arange(50).reshape(5,10)) - assert_equal(x[:2, np.array(-1)], x[:2, -1].T) - def test_finalize_gets_full_info(self): # Array finalize should be called on the filled array. 
class SubClass(np.ndarray): diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py index 513a71b99..cf50d5d5c 100644 --- a/numpy/core/tests/test_longdouble.py +++ b/numpy/core/tests/test_longdouble.py @@ -6,7 +6,7 @@ import numpy as np from numpy.testing import ( assert_, assert_equal, assert_raises, assert_array_equal, temppath, ) -from ._locales import CommaDecimalPointLocale +from numpy.core.tests._locales import CommaDecimalPointLocale LD_INFO = np.finfo(np.longdouble) longdouble_longer_than_double = (LD_INFO.eps < np.finfo(np.double).eps) diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 00dfa6ada..3bc7e92c1 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -34,7 +34,7 @@ from numpy.testing import ( assert_allclose, IS_PYPY, HAS_REFCOUNT, assert_array_less, runstring, SkipTest, temppath, suppress_warnings ) -from ._locales import CommaDecimalPointLocale +from numpy.core.tests._locales import CommaDecimalPointLocale # Need to test an object that does not fully implement math interface from datetime import timedelta, datetime @@ -573,6 +573,22 @@ class TestZeroRank(object): x = np.array(2) assert_raises(ValueError, np.add, x, [1], x) + def test_real_imag(self): + # contiguity checks are for gh-11245 + x = np.array(1j) + xr = x.real + xi = x.imag + + assert_equal(xr, np.array(0)) + assert_(type(xr) is np.ndarray) + assert_equal(xr.flags.contiguous, True) + assert_equal(xr.flags.f_contiguous, True) + + assert_equal(xi, np.array(1)) + assert_(type(xi) is np.ndarray) + assert_equal(xi.flags.contiguous, True) + assert_equal(xi.flags.f_contiguous, True) + class TestScalarIndexing(object): def setup(self): @@ -1745,13 +1761,6 @@ class TestMethods(object): assert_equal(r, np.array([('a', 1), ('c', 3), ('b', 255), ('d', 258)], dtype=mydtype)) - def test_sort_matrix_none(self): - a = np.matrix([[2, 1, 0]]) - actual = np.sort(a, axis=None) - expected = 
np.matrix([[0, 1, 2]]) - assert_equal(actual, expected) - assert_(type(expected) is np.matrix) - def test_argsort(self): # all c scalar argsorts use the same code with different types # so it suffices to run a quick check with one type. The number @@ -2497,14 +2506,6 @@ class TestMethods(object): assert_array_equal(np.partition(d, kth)[kth], tgt, err_msg="data: %r\n kth: %r" % (d, kth)) - def test_partition_matrix_none(self): - # gh-4301 - a = np.matrix([[2, 1, 0]]) - actual = np.partition(a, 1, axis=None) - expected = np.matrix([[0, 1, 2]]) - assert_equal(actual, expected) - assert_(type(expected) is np.matrix) - def test_argpartition_gh5524(self): # A test for functionality of argpartition on lists. d = [6,7,3,2,9,0] @@ -3332,7 +3333,39 @@ class TestBinop(object): with assert_raises(NotImplementedError): a ** 2 + def test_pow_array_object_dtype(self): + # test pow on arrays of object dtype + class SomeClass(object): + def __init__(self, num=None): + self.num = num + + # want to ensure a fast pow path is not taken + def __mul__(self, other): + raise AssertionError('__mul__ should not be called') + + def __div__(self, other): + raise AssertionError('__div__ should not be called') + + def __pow__(self, exp): + return SomeClass(num=self.num ** exp) + + def __eq__(self, other): + if isinstance(other, SomeClass): + return self.num == other.num + + __rpow__ = __pow__ + def pow_for(exp, arr): + return np.array([x ** exp for x in arr]) + + obj_arr = np.array([SomeClass(1), SomeClass(2), SomeClass(3)]) + + assert_equal(obj_arr ** 0.5, pow_for(0.5, obj_arr)) + assert_equal(obj_arr ** 0, pow_for(0, obj_arr)) + assert_equal(obj_arr ** 1, pow_for(1, obj_arr)) + assert_equal(obj_arr ** -1, pow_for(-1, obj_arr)) + assert_equal(obj_arr ** 2, pow_for(2, obj_arr)) + class TestTemporaryElide(object): # elision is only triggered on relatively large arrays @@ -5279,13 +5312,6 @@ class TestDot(object): assert_equal(np.dot(b, a), res) assert_equal(np.dot(b, b), res) - def 
test_dot_scalar_and_matrix_of_objects(self): - # Ticket #2469 - arr = np.matrix([1, 2], dtype=object) - desired = np.matrix([[3, 6]], dtype=object) - assert_equal(np.dot(arr, 3), desired) - assert_equal(np.dot(3, arr), desired) - def test_accelerate_framework_sgemv_fix(self): def aligned_array(shape, align, dtype, order='C'): @@ -5641,21 +5667,6 @@ class TestInner(object): assert_equal(np.inner(vec, sca), desired) assert_equal(np.inner(sca, vec), desired) - def test_inner_scalar_and_matrix(self): - for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?': - sca = np.array(3, dtype=dt)[()] - arr = np.matrix([[1, 2], [3, 4]], dtype=dt) - desired = np.matrix([[3, 6], [9, 12]], dtype=dt) - assert_equal(np.inner(arr, sca), desired) - assert_equal(np.inner(sca, arr), desired) - - def test_inner_scalar_and_matrix_of_objects(self): - # Ticket #4482 - arr = np.matrix([1, 2], dtype=object) - desired = np.matrix([[3, 6]], dtype=object) - assert_equal(np.inner(arr, 3), desired) - assert_equal(np.inner(3, arr), desired) - def test_vecself(self): # Ticket 844. # Inner product of a vector with itself segfaults or give @@ -6522,20 +6533,17 @@ class TestNewBufferProtocol(object): a = np.empty((1,) * 32) self._check_roundtrip(a) - def _make_ctype(shape, scalar_type): - t = scalar_type - for dim in shape[::-1]: - t = dim * t - return t - - # This creates deeply nested reference cycles that cause - # np.lib.tests.test_io.test_load_refcount to erroneously fail (gh-10891). 
- # Not making it a local ensure that the GC doesn't touch it during the tests - c_u8_33d = _make_ctype((1,)*33, ctypes.c_uint8) - + @pytest.mark.skipif(sys.version_info < (2, 7, 7), reason="See gh-11115") def test_error_too_many_dims(self): + def make_ctype(shape, scalar_type): + t = scalar_type + for dim in shape[::-1]: + t = dim * t + return t + # construct a memoryview with 33 dimensions - m = memoryview(self.c_u8_33d()) + c_u8_33d = make_ctype((1,)*33, ctypes.c_uint8) + m = memoryview(c_u8_33d()) assert_equal(m.ndim, 33) assert_raises_regex( @@ -7294,7 +7302,7 @@ class TestWritebackIfCopy(object): # after resolve, the two arrays no longer reference each other assert_(arr_wb.ctypes.data != 0) assert_equal(arr_wb.base, None) - # assigning to arr_wb does not get transfered to arr + # assigning to arr_wb does not get transferred to arr arr_wb[...] = 100 assert_equal(arr, -100) @@ -7325,7 +7333,7 @@ class TestWritebackIfCopy(object): assert_equal(arr_wb.base, None) if HAS_REFCOUNT: assert_equal(arr_cnt, sys.getrefcount(arr)) - # assigning to arr_wb does not get transfered to arr + # assigning to arr_wb does not get transferred to arr arr_wb[...] 
= 100 assert_equal(arr, orig) diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py index 77c26eacf..a0096efdb 100644 --- a/numpy/core/tests/test_nditer.py +++ b/numpy/core/tests/test_nditer.py @@ -811,7 +811,7 @@ def test_iter_nbo_align_contig(): assert_equal(i.operands[0], a) i.operands[0][:] = 2 assert_equal(au, [2]*6) - i = None # should not raise a DeprecationWarning + del i # should not raise a warning # Byte order change by requesting NBO a = np.arange(6, dtype='f4') au = a.byteswap().newbyteorder() @@ -1469,26 +1469,25 @@ def test_iter_allocate_output_types_scalar(): def test_iter_allocate_output_subtype(): # Make sure that the subtype with priority wins + class MyNDArray(np.ndarray): + __array_priority__ = 15 - # matrix vs ndarray - a = np.matrix([[1, 2], [3, 4]]) + # subclass vs ndarray + a = np.array([[1, 2], [3, 4]]).view(MyNDArray) b = np.arange(4).reshape(2, 2).T i = nditer([a, b, None], [], - [['readonly'], ['readonly'], ['writeonly', 'allocate']]) + [['readonly'], ['readonly'], ['writeonly', 'allocate']]) assert_equal(type(a), type(i.operands[2])) - assert_(type(b) != type(i.operands[2])) + assert_(type(b) is not type(i.operands[2])) assert_equal(i.operands[2].shape, (2, 2)) - # matrix always wants things to be 2D - b = np.arange(4).reshape(1, 2, 2) - assert_raises(RuntimeError, nditer, [a, b, None], [], - [['readonly'], ['readonly'], ['writeonly', 'allocate']]) - # but if subtypes are disabled, the result can still work + # If subtypes are disabled, we should get back an ndarray. 
i = nditer([a, b, None], [], - [['readonly'], ['readonly'], ['writeonly', 'allocate', 'no_subtype']]) + [['readonly'], ['readonly'], + ['writeonly', 'allocate', 'no_subtype']]) assert_equal(type(b), type(i.operands[2])) - assert_(type(a) != type(i.operands[2])) - assert_equal(i.operands[2].shape, (1, 2, 2)) + assert_(type(a) is not type(i.operands[2])) + assert_equal(i.operands[2].shape, (2, 2)) def test_iter_allocate_output_errors(): # Check that the iterator will throw errors for bad output allocations @@ -2838,12 +2837,30 @@ def test_writebacks(): it = nditer(au, [], [['readwrite', 'updateifcopy']], casting='equiv', op_dtypes=[np.dtype('f4')]) - au = None + # reentering works + with it: + with it: + for x in it: + x[...] = 123 + + it = nditer(au, [], + [['readwrite', 'updateifcopy']], + casting='equiv', op_dtypes=[np.dtype('f4')]) + # make sure exiting the inner context manager closes the iterator + with it: + with it: + for x in it: + x[...] = 123 + assert_raises(ValueError, getattr, it, 'operands') # do not crash if original data array is decrefed + it = nditer(au, [], + [['readwrite', 'updateifcopy']], + casting='equiv', op_dtypes=[np.dtype('f4')]) + del au with it: for x in it: x[...] 
= 123 - # make sure we cannot reenter the iterand + # make sure we cannot reenter the closed iterator enter = it.__enter__ assert_raises(ValueError, enter) diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 40cccd404..53486dc51 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -552,7 +552,6 @@ class TestFloatExceptions(object): self.assert_raises_fpe(fpeerr, flop, sc1, sc2[()]) self.assert_raises_fpe(fpeerr, flop, sc1[()], sc2[()]) - @pytest.mark.xfail(reason="See ticket #2350") def test_floating_exceptions(self): # Test basic arithmetic function errors with np.errstate(all='raise'): @@ -905,7 +904,7 @@ class TestTypes(object): fi = np.finfo(dt) assert_(np.can_cast(fi.min, dt)) assert_(np.can_cast(fi.max, dt)) - + # Custom exception class to test exception propagation in fromiter class NIterError(Exception): @@ -2201,13 +2200,16 @@ class TestLikeFuncs(object): self.compare_array_value(dz, value, fill_value) # Test the 'subok' parameter - a = np.matrix([[1, 2], [3, 4]]) + class MyNDArray(np.ndarray): + pass + + a = np.array([[1, 2], [3, 4]]).view(MyNDArray) b = like_function(a, **fill_kwarg) - assert_(type(b) is np.matrix) + assert_(type(b) is MyNDArray) b = like_function(a, subok=False, **fill_kwarg) - assert_(type(b) is not np.matrix) + assert_(type(b) is not MyNDArray) def test_ones_like(self): self.check_like_function(np.ones_like, 1) diff --git a/numpy/core/tests/test_print.py b/numpy/core/tests/test_print.py index 746ad0e4b..433208748 100644 --- a/numpy/core/tests/test_print.py +++ b/numpy/core/tests/test_print.py @@ -4,7 +4,7 @@ import sys import numpy as np from numpy.testing import assert_, assert_equal, SkipTest -from ._locales import CommaDecimalPointLocale +from numpy.core.tests._locales import CommaDecimalPointLocale if sys.version_info[0] >= 3: diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index b3cb3e610..f8f75d9ea 100644 --- 
a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -224,6 +224,42 @@ class TestRegression(object): x = np.arange(10, dtype='>f8') assert_array_equal(ref, x) + def test_arange_inf_step(self): + ref = np.arange(0, 1, 10) + x = np.arange(0, 1, np.inf) + assert_array_equal(ref, x) + + ref = np.arange(0, 1, -10) + x = np.arange(0, 1, -np.inf) + assert_array_equal(ref, x) + + ref = np.arange(0, -1, -10) + x = np.arange(0, -1, -np.inf) + assert_array_equal(ref, x) + + ref = np.arange(0, -1, 10) + x = np.arange(0, -1, np.inf) + assert_array_equal(ref, x) + + def test_arange_underflow_stop_and_step(self): + finfo = np.finfo(np.float64) + + ref = np.arange(0, finfo.eps, 2 * finfo.eps) + x = np.arange(0, finfo.eps, finfo.max) + assert_array_equal(ref, x) + + ref = np.arange(0, finfo.eps, -2 * finfo.eps) + x = np.arange(0, finfo.eps, -finfo.max) + assert_array_equal(ref, x) + + ref = np.arange(0, -finfo.eps, -2 * finfo.eps) + x = np.arange(0, -finfo.eps, -finfo.max) + assert_array_equal(ref, x) + + ref = np.arange(0, -finfo.eps, 2 * finfo.eps) + x = np.arange(0, -finfo.eps, finfo.max) + assert_array_equal(ref, x) + def test_argmax(self): # Ticket #119 a = np.random.normal(0, 1, (4, 5, 6, 7, 8)) @@ -2325,13 +2361,10 @@ class TestRegression(object): def test_void_item_memview(self): va = np.zeros(10, 'V4') - # for now, there is just a futurewarning - assert_warns(FutureWarning, va[:1].item) - # in the future, test we got a bytes copy: - #x = va[:1].item() - #va[0] = b'\xff\xff\xff\xff' - #del va - #assert_equal(x, b'\x00\x00\x00\x00') + x = va[:1].item() + va[0] = b'\xff\xff\xff\xff' + del va + assert_equal(x, b'\x00\x00\x00\x00') def test_structarray_title(self): # The following used to segfault on pypy, due to NPY_TITLE_KEY diff --git a/numpy/core/tests/test_scalarprint.py b/numpy/core/tests/test_scalarprint.py index 94d8294f1..472ff691d 100644 --- a/numpy/core/tests/test_scalarprint.py +++ b/numpy/core/tests/test_scalarprint.py @@ -4,10 +4,13 
@@ """ from __future__ import division, absolute_import, print_function -import tempfile -import numpy as np -from numpy.testing import assert_, assert_equal +import code, sys +import platform +import pytest +from tempfile import TemporaryFile +import numpy as np +from numpy.testing import assert_, assert_equal, suppress_warnings, dec class TestRealScalars(object): def test_str(self): @@ -53,7 +56,7 @@ class TestRealScalars(object): # output to a "real file" (ie, not a StringIO). Make sure we don't # inherit it. x = np.double(0.1999999999999) - with tempfile.TemporaryFile('r+t') as f: + with TemporaryFile('r+t') as f: print(x, file=f) f.seek(0) output = f.read() @@ -62,6 +65,37 @@ class TestRealScalars(object): # precision as '0.2', but we want numpy's np.double('0.1999999999999') # to print the unique value, '0.1999999999999'. + # gh-11031 + # Only in the python2 interactive shell and when stdout is a "real" + # file, the output of the last command is printed to stdout without + # Py_PRINT_RAW (unlike the print statement) so `>>> x` and `>>> print + # x` are potentially different. Make sure they are the same. The only + # way I found to get prompt-like output is using an actual prompt from + # the 'code' module. Again, must use tempfile to get a "real" file. + + # dummy user-input which enters one line and then ctrl-Ds. 
+ def userinput(): + yield 'np.sqrt(2)' + raise EOFError + gen = userinput() + input_func = lambda prompt="": next(gen) + + with TemporaryFile('r+t') as fo, TemporaryFile('r+t') as fe: + orig_stdout, orig_stderr = sys.stdout, sys.stderr + sys.stdout, sys.stderr = fo, fe + + # py2 code.interact sends irrelevant internal DeprecationWarnings + with suppress_warnings() as sup: + sup.filter(DeprecationWarning) + code.interact(local={'np': np}, readfunc=input_func, banner='') + + sys.stdout, sys.stderr = orig_stdout, orig_stderr + + fo.seek(0) + capture = fo.read().strip() + + assert_equal(capture, repr(np.sqrt(2))) + def test_dragon4(self): # these tests are adapted from Ryan Juckett's dragon4 implementation, # see dragon4.c for details. @@ -218,6 +252,66 @@ class TestRealScalars(object): "1.2" if tp != np.float16 else "1.2002") assert_equal(fpos(tp('1.'), trim='-'), "1") + @pytest.mark.skipif(not platform.machine().startswith("ppc64"), + reason="only applies to ppc float128 values") + def test_ppc64_ibm_double_double128(self): + # check that the precision decreases once we get into the subnormal + # range. Unlike float64, this starts around 1e-292 instead of 1e-308, + # which happens when the first double is normal and the second is + # subnormal. 
+ x = np.float128('2.123123123123123123123123123123123e-286') + got = [str(x/np.float128('2e' + str(i))) for i in range(0,40)] + expected = [ + "1.06156156156156156156156156156157e-286", + "1.06156156156156156156156156156158e-287", + "1.06156156156156156156156156156159e-288", + "1.0615615615615615615615615615616e-289", + "1.06156156156156156156156156156157e-290", + "1.06156156156156156156156156156156e-291", + "1.0615615615615615615615615615616e-292", + "1.0615615615615615615615615615615e-293", + "1.061561561561561561561561561562e-294", + "1.06156156156156156156156156155e-295", + "1.0615615615615615615615615616e-296", + "1.06156156156156156156156156e-297", + "1.06156156156156156156156157e-298", + "1.0615615615615615615615616e-299", + "1.06156156156156156156156e-300", + "1.06156156156156156156155e-301", + "1.0615615615615615615616e-302", + "1.061561561561561561562e-303", + "1.06156156156156156156e-304", + "1.0615615615615615618e-305", + "1.06156156156156156e-306", + "1.06156156156156157e-307", + "1.0615615615615616e-308", + "1.06156156156156e-309", + "1.06156156156157e-310", + "1.0615615615616e-311", + "1.06156156156e-312", + "1.06156156154e-313", + "1.0615615616e-314", + "1.06156156e-315", + "1.06156155e-316", + "1.061562e-317", + "1.06156e-318", + "1.06155e-319", + "1.0617e-320", + "1.06e-321", + "1.04e-322", + "1e-323", + "0.0", + "0.0"] + assert_equal(got, expected) + + # Note: we follow glibc behavior, but it (or gcc) might not be right. 
+ # In particular we can get two values that print the same but are not + # equal: + a = np.float128('2')/np.float128('3') + b = np.float128(str(a)) + assert_equal(str(a), str(b)) + assert_(a != b) + def float32_roundtrip(self): # gh-9360 x = np.float32(1024 - 2**-14) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index 1d91a651e..72b3451a4 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -364,10 +364,6 @@ def test_stack(): stack, [np.zeros((3, 3)), np.zeros(3)], axis=1) assert_raises_regex(ValueError, 'must have the same shape', stack, [np.arange(2), np.arange(3)]) - # np.matrix - m = np.matrix([[1, 2], [3, 4]]) - assert_raises_regex(ValueError, 'shape too large to be a matrix', - stack, [m, m]) class TestBlock(object): diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 7a276c04d..ef9ced354 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -5,6 +5,7 @@ import itertools import numpy as np import numpy.core._umath_tests as umt +import numpy.linalg._umath_linalg as uml import numpy.core._operand_flag_tests as opflag_tests import numpy.core._rational_tests as _rational_tests from numpy.testing import ( @@ -35,6 +36,10 @@ class TestUfuncKwargs(object): assert_raises(RuntimeError, np.add, 1, 2, signature='ii->i', dtype=int) + def test_extobj_refcount(self): + # Should not segfault with USE_DEBUG. + assert_raises(TypeError, np.add, 1, 2, extobj=[4096], parrot=True) + class TestUfunc(object): def test_pickle(self): @@ -284,10 +289,16 @@ class TestUfunc(object): def test_signature(self): # the arguments to test_signature are: nin, nout, core_signature # pass - assert_equal(umt.test_signature(2, 1, "(i),(i)->()"), 1) + enabled, num_dims, ixs = umt.test_signature(2, 1, "(i),(i)->()") + assert_equal(enabled, 1) + assert_equal(num_dims, (1, 1, 0)) + assert_equal(ixs, (0, 0)) - # pass. 
empty core signature; treat as plain ufunc (with trivial core) - assert_equal(umt.test_signature(2, 1, "(),()->()"), 0) + # empty core signature; treat as plain ufunc (with trivial core) + enabled, num_dims, ixs = umt.test_signature(2, 1, "(),()->()") + assert_equal(enabled, 0) + assert_equal(num_dims, (0, 0, 0)) + assert_equal(ixs, ()) # in the following calls, a ValueError should be raised because # of error in core signature @@ -326,7 +337,10 @@ class TestUfunc(object): pass # more complicated names for variables - assert_equal(umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)"), 1) + enabled, num_dims, ixs = umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)") + assert_equal(enabled, 1) + assert_equal(num_dims, (2, 1, 1)) + assert_equal(ixs, (0, 1, 2, 3)) def test_get_signature(self): assert_equal(umt.inner1d.signature, "(i),(i)->()") @@ -494,6 +508,17 @@ class TestUfunc(object): d += d assert_almost_equal(d, 2. + 2j) + def test_sum_initial(self): + # Integer, single axis + assert_equal(np.sum([3], initial=2), 5) + + # Floating point + assert_almost_equal(np.sum([0.2], initial=0.1), 0.3) + + # Multiple non-adjacent axes + assert_equal(np.sum(np.ones((2, 3, 5), dtype=np.int64), axis=(0, 2), initial=2), + [12, 12, 12]) + def test_inner1d(self): a = np.arange(6).reshape((2, 3)) assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1)) @@ -600,49 +625,49 @@ class TestUfunc(object): def test_axes_argument(self): # inner1d signature: '(i),(i)->()' - in1d = umt.inner1d + inner1d = umt.inner1d a = np.arange(27.).reshape((3, 3, 3)) b = np.arange(10., 19.).reshape((3, 1, 3)) # basic tests on inputs (outputs tested below with matrix_multiply). - c = in1d(a, b) + c = inner1d(a, b) assert_array_equal(c, (a * b).sum(-1)) # default - c = in1d(a, b, axes=[(-1,), (-1,), ()]) + c = inner1d(a, b, axes=[(-1,), (-1,), ()]) assert_array_equal(c, (a * b).sum(-1)) # integers ok for single axis. 
- c = in1d(a, b, axes=[-1, -1, ()]) + c = inner1d(a, b, axes=[-1, -1, ()]) assert_array_equal(c, (a * b).sum(-1)) # mix fine - c = in1d(a, b, axes=[(-1,), -1, ()]) + c = inner1d(a, b, axes=[(-1,), -1, ()]) assert_array_equal(c, (a * b).sum(-1)) # can omit last axis. - c = in1d(a, b, axes=[-1, -1]) + c = inner1d(a, b, axes=[-1, -1]) assert_array_equal(c, (a * b).sum(-1)) # can pass in other types of integer (with __index__ protocol) - c = in1d(a, b, axes=[np.int8(-1), np.array(-1, dtype=np.int32)]) + c = inner1d(a, b, axes=[np.int8(-1), np.array(-1, dtype=np.int32)]) assert_array_equal(c, (a * b).sum(-1)) # swap some axes - c = in1d(a, b, axes=[0, 0]) + c = inner1d(a, b, axes=[0, 0]) assert_array_equal(c, (a * b).sum(0)) - c = in1d(a, b, axes=[0, 2]) + c = inner1d(a, b, axes=[0, 2]) assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1)) # Check errors for improperly constructed axes arguments. # should have list. - assert_raises(TypeError, in1d, a, b, axes=-1) + assert_raises(TypeError, inner1d, a, b, axes=-1) # needs enough elements - assert_raises(ValueError, in1d, a, b, axes=[-1]) + assert_raises(ValueError, inner1d, a, b, axes=[-1]) # should pass in indices. 
- assert_raises(TypeError, in1d, a, b, axes=[-1.0, -1.0]) - assert_raises(TypeError, in1d, a, b, axes=[(-1.0,), -1]) - assert_raises(TypeError, in1d, a, b, axes=[None, 1]) + assert_raises(TypeError, inner1d, a, b, axes=[-1.0, -1.0]) + assert_raises(TypeError, inner1d, a, b, axes=[(-1.0,), -1]) + assert_raises(TypeError, inner1d, a, b, axes=[None, 1]) # cannot pass an index unless there is only one dimension # (output is wrong in this case) - assert_raises(TypeError, in1d, a, b, axes=[-1, -1, -1]) + assert_raises(TypeError, inner1d, a, b, axes=[-1, -1, -1]) # or pass in generally the wrong number of axes - assert_raises(ValueError, in1d, a, b, axes=[-1, -1, (-1,)]) - assert_raises(ValueError, in1d, a, b, axes=[-1, (-2, -1), ()]) + assert_raises(ValueError, inner1d, a, b, axes=[-1, -1, (-1,)]) + assert_raises(ValueError, inner1d, a, b, axes=[-1, (-2, -1), ()]) # axes need to have same length. - assert_raises(ValueError, in1d, a, b, axes=[0, 1]) + assert_raises(ValueError, inner1d, a, b, axes=[0, 1]) # matrix_multiply signature: '(m,n),(n,p)->(m,p)' mm = umt.matrix_multiply @@ -696,6 +721,133 @@ class TestUfunc(object): assert_raises(ValueError, mm, z, z, out=z[:, 0]) assert_raises(ValueError, mm, z[1], z, axes=[0, 1]) assert_raises(ValueError, mm, z, z, out=z[0], axes=[0, 1]) + # Regular ufuncs should not accept axes. + assert_raises(TypeError, np.add, 1., 1., axes=[0]) + # should be able to deal with bad unrelated kwargs. 
+ assert_raises(TypeError, mm, z, z, axes=[0, 1], parrot=True) + + def test_axis_argument(self): + # inner1d signature: '(i),(i)->()' + inner1d = umt.inner1d + a = np.arange(27.).reshape((3, 3, 3)) + b = np.arange(10., 19.).reshape((3, 1, 3)) + c = inner1d(a, b) + assert_array_equal(c, (a * b).sum(-1)) + c = inner1d(a, b, axis=-1) + assert_array_equal(c, (a * b).sum(-1)) + out = np.zeros_like(c) + d = inner1d(a, b, axis=-1, out=out) + assert_(d is out) + assert_array_equal(d, c) + c = inner1d(a, b, axis=0) + assert_array_equal(c, (a * b).sum(0)) + # Sanity checks on innerwt and cumsum. + a = np.arange(6).reshape((2, 3)) + b = np.arange(10, 16).reshape((2, 3)) + w = np.arange(20, 26).reshape((2, 3)) + assert_array_equal(umt.innerwt(a, b, w, axis=0), + np.sum(a * b * w, axis=0)) + assert_array_equal(umt.cumsum(a, axis=0), np.cumsum(a, axis=0)) + assert_array_equal(umt.cumsum(a, axis=-1), np.cumsum(a, axis=-1)) + out = np.empty_like(a) + b = umt.cumsum(a, out=out, axis=0) + assert_(out is b) + assert_array_equal(b, np.cumsum(a, axis=0)) + b = umt.cumsum(a, out=out, axis=1) + assert_(out is b) + assert_array_equal(b, np.cumsum(a, axis=-1)) + # Check errors. + # Cannot pass in both axis and axes. + assert_raises(TypeError, inner1d, a, b, axis=0, axes=[0, 0]) + # Not an integer. + assert_raises(TypeError, inner1d, a, b, axis=[0]) + # more than 1 core dimensions. + mm = umt.matrix_multiply + assert_raises(TypeError, mm, a, b, axis=1) + # Output wrong size in axis. + out = np.empty((1, 2, 3), dtype=a.dtype) + assert_raises(ValueError, umt.cumsum, a, out=out, axis=0) + # Regular ufuncs should not accept axis. 
+ assert_raises(TypeError, np.add, 1., 1., axis=0) + + def test_keepdims_argument(self): + # inner1d signature: '(i),(i)->()' + inner1d = umt.inner1d + a = np.arange(27.).reshape((3, 3, 3)) + b = np.arange(10., 19.).reshape((3, 1, 3)) + c = inner1d(a, b) + assert_array_equal(c, (a * b).sum(-1)) + c = inner1d(a, b, keepdims=False) + assert_array_equal(c, (a * b).sum(-1)) + c = inner1d(a, b, keepdims=True) + assert_array_equal(c, (a * b).sum(-1, keepdims=True)) + out = np.zeros_like(c) + d = inner1d(a, b, keepdims=True, out=out) + assert_(d is out) + assert_array_equal(d, c) + # Now combined with axis and axes. + c = inner1d(a, b, axis=-1, keepdims=False) + assert_array_equal(c, (a * b).sum(-1, keepdims=False)) + c = inner1d(a, b, axis=-1, keepdims=True) + assert_array_equal(c, (a * b).sum(-1, keepdims=True)) + c = inner1d(a, b, axis=0, keepdims=False) + assert_array_equal(c, (a * b).sum(0, keepdims=False)) + c = inner1d(a, b, axis=0, keepdims=True) + assert_array_equal(c, (a * b).sum(0, keepdims=True)) + c = inner1d(a, b, axes=[(-1,), (-1,), ()], keepdims=False) + assert_array_equal(c, (a * b).sum(-1)) + c = inner1d(a, b, axes=[(-1,), (-1,), (-1,)], keepdims=True) + assert_array_equal(c, (a * b).sum(-1, keepdims=True)) + c = inner1d(a, b, axes=[0, 0], keepdims=False) + assert_array_equal(c, (a * b).sum(0)) + c = inner1d(a, b, axes=[0, 0, 0], keepdims=True) + assert_array_equal(c, (a * b).sum(0, keepdims=True)) + c = inner1d(a, b, axes=[0, 2], keepdims=False) + assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1)) + c = inner1d(a, b, axes=[0, 2], keepdims=True) + assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1, + keepdims=True)) + c = inner1d(a, b, axes=[0, 2, 2], keepdims=True) + assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1, + keepdims=True)) + c = inner1d(a, b, axes=[0, 2, 0], keepdims=True) + assert_array_equal(c, (a * b.transpose(2, 0, 1)).sum(0, keepdims=True)) + # Hardly useful, but should work. 
+ c = inner1d(a, b, axes=[0, 2, 1], keepdims=True) + assert_array_equal(c, (a.transpose(1, 0, 2) * b.transpose(0, 2, 1)) + .sum(1, keepdims=True)) + # Check with two core dimensions. + a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis] + expected = uml.det(a) + c = uml.det(a, keepdims=False) + assert_array_equal(c, expected) + c = uml.det(a, keepdims=True) + assert_array_equal(c, expected[:, np.newaxis, np.newaxis]) + a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis] + expected_s, expected_l = uml.slogdet(a) + cs, cl = uml.slogdet(a, keepdims=False) + assert_array_equal(cs, expected_s) + assert_array_equal(cl, expected_l) + cs, cl = uml.slogdet(a, keepdims=True) + assert_array_equal(cs, expected_s[:, np.newaxis, np.newaxis]) + assert_array_equal(cl, expected_l[:, np.newaxis, np.newaxis]) + # Sanity check on innerwt. + a = np.arange(6).reshape((2, 3)) + b = np.arange(10, 16).reshape((2, 3)) + w = np.arange(20, 26).reshape((2, 3)) + assert_array_equal(umt.innerwt(a, b, w, keepdims=True), + np.sum(a * b * w, axis=-1, keepdims=True)) + assert_array_equal(umt.innerwt(a, b, w, axis=0, keepdims=True), + np.sum(a * b * w, axis=0, keepdims=True)) + # Check errors. + # Not a boolean + assert_raises(TypeError, inner1d, a, b, keepdims='true') + # More than 1 core dimension, and core output dimensions. + mm = umt.matrix_multiply + assert_raises(TypeError, mm, a, b, keepdims=True) + assert_raises(TypeError, mm, a, b, keepdims=False) + # Regular ufuncs should not accept keepdims. 
+ assert_raises(TypeError, np.add, 1., 1., keepdims=False) def test_innerwt(self): a = np.arange(6).reshape((2, 3)) @@ -788,6 +940,11 @@ class TestUfunc(object): # An output array is required to determine p with signature (n,d)->(p) assert_raises(ValueError, umt.euclidean_pdist, a) + def test_cumsum(self): + a = np.arange(10) + result = umt.cumsum(a) + assert_array_equal(result, a.cumsum()) + def test_object_logical(self): a = np.array([3, None, True, False, "test", ""], dtype=object) assert_equal(np.logical_or(a, None), @@ -844,6 +1001,7 @@ class TestUfunc(object): assert_equal(np.min(a), False) assert_equal(np.array([[1]], dtype=object).sum(), 1) assert_equal(np.array([[[1, 2]]], dtype=object).sum((0, 1)), [1, 2]) + assert_equal(np.array([1], dtype=object).sum(initial=1), 2) def test_object_array_accumulate_inplace(self): # Checks that in-place accumulates work, see also gh-7402 @@ -880,13 +1038,6 @@ class TestUfunc(object): np.add.reduceat(arr, np.arange(4), out=arr, axis=-1) assert_array_equal(arr, out) - def test_object_scalar_multiply(self): - # Tickets #2469 and #4482 - arr = np.matrix([1, 2], dtype=object) - desired = np.matrix([[3, 6]], dtype=object) - assert_equal(np.multiply(arr, 3), desired) - assert_equal(np.multiply(3, arr), desired) - def test_zerosize_reduction(self): # Test with default dtype and object dtype for a in [[], np.array([], dtype=object)]: @@ -987,7 +1138,7 @@ class TestUfunc(object): assert_equal(np.sqrt(a, where=m), [1]) def check_identityless_reduction(self, a): - # np.minimum.reduce is a identityless reduction + # np.minimum.reduce is an identityless reduction # Verify that it sees the zero at various positions a[...] = 1 @@ -1056,6 +1207,35 @@ class TestUfunc(object): a = a[1:, 1:, 1:] self.check_identityless_reduction(a) + def test_initial_reduction(self): + # np.minimum.reduce is an identityless reduction + + # For cases like np.maximum(np.abs(...), initial=0) + # More generally, a supremum over non-negative numbers. 
+ assert_equal(np.maximum.reduce([], initial=0), 0) + + # For cases like reduction of an empty array over the reals. + assert_equal(np.minimum.reduce([], initial=np.inf), np.inf) + assert_equal(np.maximum.reduce([], initial=-np.inf), -np.inf) + + # Random tests + assert_equal(np.minimum.reduce([5], initial=4), 4) + assert_equal(np.maximum.reduce([4], initial=5), 5) + assert_equal(np.maximum.reduce([5], initial=4), 5) + assert_equal(np.minimum.reduce([4], initial=5), 4) + + # Check initial=None raises ValueError for both types of ufunc reductions + assert_raises(ValueError, np.minimum.reduce, [], initial=None) + assert_raises(ValueError, np.add.reduce, [], initial=None) + + # Check that np._NoValue gives default behavior. + assert_equal(np.add.reduce([], initial=np._NoValue), 0) + + # Check that initial kwarg behaves as intended for dtype=object + a = np.array([10], dtype=object) + res = np.add.reduce(a, initial=5) + assert_equal(res, 15) + def test_identityless_reduction_nonreorderable(self): a = np.array([[8.0, 2.0, 2.0], [1.0, 0.5, 0.25]]) @@ -1407,15 +1587,18 @@ class TestUfunc(object): assert_equal(f(d, 0, None, None), r) assert_equal(f(d, 0, None, None, keepdims=False), r) assert_equal(f(d, 0, None, None, True), r.reshape((1,) + r.shape)) + assert_equal(f(d, 0, None, None, False, 0), r) + assert_equal(f(d, 0, None, None, False, initial=0), r) # multiple keywords assert_equal(f(d, axis=0, dtype=None, out=None, keepdims=False), r) assert_equal(f(d, 0, dtype=None, out=None, keepdims=False), r) assert_equal(f(d, 0, None, out=None, keepdims=False), r) + assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0), r) # too little assert_raises(TypeError, f) # too much - assert_raises(TypeError, f, d, 0, None, None, False, 1) + assert_raises(TypeError, f, d, 0, None, None, False, 0, 1) # invalid axis assert_raises(TypeError, f, d, "invalid") assert_raises(TypeError, f, d, axis="invalid") diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py 
index 9da6abd4b..3c0d1759a 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -1328,6 +1328,17 @@ class TestMinMax(object): assert_equal(d.max(), d[0]) assert_equal(d.min(), d[0]) + def test_reduce_warns(self): + # gh 10370, 11029 Some compilers reorder the call to npy_getfloatstatus + # and put it before the call to an intrisic function that causes + # invalid status to be set. Also make sure warnings are emitted + for n in (2, 4, 8, 16, 32): + with suppress_warnings() as sup: + sup.record(RuntimeWarning) + for r in np.diagflat([np.nan] * n): + assert_equal(np.min(r), np.nan) + assert_equal(len(sup.log), n) + class TestAbsoluteNegative(object): def test_abs_neg_blocked(self): @@ -1413,6 +1424,57 @@ class TestSpecialMethods(object): assert_equal(args[1], a) assert_equal(i, 0) + def test_wrap_and_prepare_out(self): + # Calling convention for out should not affect how special methods are + # called + + class StoreArrayPrepareWrap(np.ndarray): + _wrap_args = None + _prepare_args = None + def __new__(cls): + return np.empty(()).view(cls) + def __array_wrap__(self, obj, context): + self._wrap_args = context[1] + return obj + def __array_prepare__(self, obj, context): + self._prepare_args = context[1] + return obj + @property + def args(self): + # We need to ensure these are fetched at the same time, before + # any other ufuncs are calld by the assertions + return (self._prepare_args, self._wrap_args) + def __repr__(self): + return "a" # for short test output + + def do_test(f_call, f_expected): + a = StoreArrayPrepareWrap() + f_call(a) + p, w = a.args + expected = f_expected(a) + try: + assert_equal(p, expected) + assert_equal(w, expected) + except AssertionError as e: + # assert_equal produces truly useless error messages + raise AssertionError("\n".join([ + "Bad arguments passed in ufunc call", + " expected: {}".format(expected), + " __array_prepare__ got: {}".format(p), + " __array_wrap__ got: {}".format(w) + ])) + + # method not on 
the out argument + do_test(lambda a: np.add(a, 0), lambda a: (a, 0)) + do_test(lambda a: np.add(a, 0, None), lambda a: (a, 0)) + do_test(lambda a: np.add(a, 0, out=None), lambda a: (a, 0)) + do_test(lambda a: np.add(a, 0, out=(None,)), lambda a: (a, 0)) + + # method on the out argument + do_test(lambda a: np.add(0, 0, a), lambda a: (0, 0, a)) + do_test(lambda a: np.add(0, 0, out=a), lambda a: (0, 0, a)) + do_test(lambda a: np.add(0, 0, out=(a,)), lambda a: (0, 0, a)) + def test_wrap_with_iterable(self): # test fix for bug #1026: @@ -1613,13 +1675,16 @@ class TestSpecialMethods(object): assert_equal(ncu.maximum(a, C()), 0) def test_ufunc_override(self): - + # check override works even with instance with high priority. class A(object): def __array_ufunc__(self, func, method, *inputs, **kwargs): return self, func, method, inputs, kwargs + class MyNDArray(np.ndarray): + __array_priority__ = 100 + a = A() - b = np.matrix([1]) + b = np.array([1]).view(MyNDArray) res0 = np.multiply(a, b) res1 = np.multiply(b, b, out=a) @@ -1745,6 +1810,7 @@ class TestSpecialMethods(object): assert_raises(TypeError, np.multiply, a) assert_raises(TypeError, np.multiply, a, a, a, a) assert_raises(TypeError, np.multiply, a, a, sig='a', signature='a') + assert_raises(TypeError, ncu_tests.inner1d, a, a, axis=0, axes=[0, 0]) # reduce, positional args res = np.multiply.reduce(a, 'axis0', 'dtype0', 'out0', 'keep0') @@ -1759,7 +1825,7 @@ class TestSpecialMethods(object): # reduce, kwargs res = np.multiply.reduce(a, axis='axis0', dtype='dtype0', out='out0', - keepdims='keep0') + keepdims='keep0', initial='init0') assert_equal(res[0], a) assert_equal(res[1], np.multiply) assert_equal(res[2], 'reduce') @@ -1767,7 +1833,8 @@ class TestSpecialMethods(object): assert_equal(res[4], {'dtype':'dtype0', 'out': ('out0',), 'keepdims': 'keep0', - 'axis': 'axis0'}) + 'axis': 'axis0', + 'initial': 'init0'}) # reduce, output equal to None removed, but not other explicit ones, # even if they are at their default 
value. @@ -1777,6 +1844,14 @@ class TestSpecialMethods(object): assert_equal(res[4], {'axis': 0, 'keepdims': True}) res = np.multiply.reduce(a, None, out=(None,), dtype=None) assert_equal(res[4], {'axis': None, 'dtype': None}) + res = np.multiply.reduce(a, 0, None, None, False, 2) + assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': 2}) + # np._NoValue ignored for initial. + res = np.multiply.reduce(a, 0, None, None, False, np._NoValue) + assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False}) + # None kept for initial. + res = np.multiply.reduce(a, 0, None, None, False, None) + assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': None}) # reduce, wrong args assert_raises(ValueError, np.multiply.reduce, a, out=()) @@ -1866,6 +1941,7 @@ class TestSpecialMethods(object): # outer, wrong args assert_raises(TypeError, np.multiply.outer, a) assert_raises(TypeError, np.multiply.outer, a, a, a, a) + assert_raises(TypeError, np.multiply.outer, a, a, sig='a', signature='a') # at res = np.multiply.at(a, [4, 2], 'b0') @@ -2620,7 +2696,7 @@ def test_nextafterf(): @pytest.mark.skipif(np.finfo(np.double) == np.finfo(np.longdouble), reason="long double is same as double") -@pytest.mark.skipif(platform.machine().startswith("ppc64"), +@pytest.mark.xfail(condition=platform.machine().startswith("ppc64"), reason="IBM double double") def test_nextafterl(): return _test_nextafter(np.longdouble) @@ -2653,7 +2729,7 @@ def test_spacingf(): @pytest.mark.skipif(np.finfo(np.double) == np.finfo(np.longdouble), reason="long double is same as double") -@pytest.mark.skipif(platform.machine().startswith("ppc64"), +@pytest.mark.xfail(condition=platform.machine().startswith("ppc64"), reason="IBM double double") def test_spacingl(): return _test_spacing(np.longdouble) diff --git a/numpy/distutils/command/config.py b/numpy/distutils/command/config.py index 66d4ed58d..47bc496cf 100644 --- a/numpy/distutils/command/config.py +++ 
b/numpy/distutils/command/config.py @@ -101,8 +101,12 @@ Original exception was: %s, and the Compiler class was %s return ret def _compile (self, body, headers, include_dirs, lang): - return self._wrap_method(old_config._compile, lang, - (body, headers, include_dirs, lang)) + src, obj = self._wrap_method(old_config._compile, lang, + (body, headers, include_dirs, lang)) + # _compile in unixcompiler.py sometimes creates .d dependency files. + # Clean them up. + self.temp_files.append(obj + '.d') + return src, obj def _link (self, body, headers, include_dirs, diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py index cb7414a04..41f0b1f61 100644 --- a/numpy/distutils/misc_util.py +++ b/numpy/distutils/misc_util.py @@ -256,6 +256,11 @@ def minrelpath(path): return '' return os.sep.join(l) +def sorted_glob(fileglob): + """sorts output of python glob for http://bugs.python.org/issue30461 + to allow extensions to have reproducible build results""" + return sorted(glob.glob(fileglob)) + def _fix_paths(paths, local_path, include_non_existing): assert is_sequence(paths), repr(type(paths)) new_paths = [] @@ -263,8 +268,8 @@ def _fix_paths(paths, local_path, include_non_existing): for n in paths: if is_string(n): if '*' in n or '?' in n: - p = glob.glob(n) - p2 = glob.glob(njoin(local_path, n)) + p = sorted_glob(n) + p2 = sorted_glob(njoin(local_path, n)) if p2: new_paths.extend(p2) elif p: @@ -528,7 +533,7 @@ def _get_headers(directory_list): # get *.h files from list of directories headers = [] for d in directory_list: - head = glob.glob(os.path.join(d, "*.h")) #XXX: *.hpp files?? + head = sorted_glob(os.path.join(d, "*.h")) #XXX: *.hpp files?? 
headers.extend(head) return headers @@ -882,7 +887,7 @@ class Configuration(object): caller_level = 1): l = subpackage_name.split('.') subpackage_path = njoin([self.local_path]+l) - dirs = [_m for _m in glob.glob(subpackage_path) if os.path.isdir(_m)] + dirs = [_m for _m in sorted_glob(subpackage_path) if os.path.isdir(_m)] config_list = [] for d in dirs: if not os.path.isfile(njoin(d, '__init__.py')): diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py index 2a3ff2e52..65d7de316 100644 --- a/numpy/distutils/system_info.py +++ b/numpy/distutils/system_info.py @@ -385,6 +385,7 @@ def get_info(name, notfound_action=0): 'blis': blis_info, # use blas_opt instead 'lapack_mkl': lapack_mkl_info, # use lapack_opt instead 'blas_mkl': blas_mkl_info, # use blas_opt instead + 'accelerate': accelerate_info, # use blas_opt instead 'x11': x11_info, 'fft_opt': fft_opt_info, 'fftw': fftw_info, @@ -1551,39 +1552,10 @@ class lapack_opt_info(system_info): if not atlas_info: atlas_info = get_info('atlas') - if sys.platform == 'darwin' \ - and not os.getenv('_PYTHON_HOST_PLATFORM', None) \ - and not (atlas_info or openblas_info or - lapack_mkl_info): - # Use the system lapack from Accelerate or vecLib under OSX - args = [] - link_args = [] - if get_platform()[-4:] == 'i386' or 'intel' in get_platform() or \ - 'x86_64' in get_platform() or \ - 'i386' in platform.platform(): - intel = 1 - else: - intel = 0 - if os.path.exists('/System/Library/Frameworks' - '/Accelerate.framework/'): - if intel: - args.extend(['-msse3']) - else: - args.extend(['-faltivec']) - link_args.extend(['-Wl,-framework', '-Wl,Accelerate']) - elif os.path.exists('/System/Library/Frameworks' - '/vecLib.framework/'): - if intel: - args.extend(['-msse3']) - else: - args.extend(['-faltivec']) - link_args.extend(['-Wl,-framework', '-Wl,vecLib']) - if args: - self.set_info(extra_compile_args=args, - extra_link_args=link_args, - define_macros=[('NO_ATLAS_INFO', 3), - ('HAVE_CBLAS', None)]) - return 
+ accelerate_info = get_info('accelerate') + if accelerate_info and not atlas_info: + self.set_info(**accelerate_info) + return need_lapack = 0 need_blas = 0 @@ -1659,43 +1631,10 @@ class blas_opt_info(system_info): if not atlas_info: atlas_info = get_info('atlas_blas') - if sys.platform == 'darwin' \ - and not os.getenv('_PYTHON_HOST_PLATFORM', None) \ - and not (atlas_info or openblas_info or - blas_mkl_info or blis_info): - # Use the system BLAS from Accelerate or vecLib under OSX - args = [] - link_args = [] - if get_platform()[-4:] == 'i386' or 'intel' in get_platform() or \ - 'x86_64' in get_platform() or \ - 'i386' in platform.platform(): - intel = 1 - else: - intel = 0 - if os.path.exists('/System/Library/Frameworks' - '/Accelerate.framework/'): - if intel: - args.extend(['-msse3']) - else: - args.extend(['-faltivec']) - args.extend([ - '-I/System/Library/Frameworks/vecLib.framework/Headers']) - link_args.extend(['-Wl,-framework', '-Wl,Accelerate']) - elif os.path.exists('/System/Library/Frameworks' - '/vecLib.framework/'): - if intel: - args.extend(['-msse3']) - else: - args.extend(['-faltivec']) - args.extend([ - '-I/System/Library/Frameworks/vecLib.framework/Headers']) - link_args.extend(['-Wl,-framework', '-Wl,vecLib']) - if args: - self.set_info(extra_compile_args=args, - extra_link_args=link_args, - define_macros=[('NO_ATLAS_INFO', 3), - ('HAVE_CBLAS', None)]) - return + accelerate_info = get_info('accelerate') + if accelerate_info and not atlas_info: + self.set_info(**accelerate_info) + return need_blas = 0 info = {} @@ -1939,6 +1878,58 @@ class blis_info(blas_info): include_dirs=incl_dirs) self.set_info(**info) +class accelerate_info(system_info): + section = 'accelerate' + notfounderror = BlasNotFoundError + + def calc_info(self): + # Make possible to enable/disable from config file/env var + libraries = os.environ.get('ACCELERATE') + if libraries: + libraries = [libraries] + else: + libraries = self.get_libs('libraries', ['accelerate', 'veclib']) 
+ libraries = [lib.strip().lower() for lib in libraries] + + if (sys.platform == 'darwin' and + not os.getenv('_PYTHON_HOST_PLATFORM', None)): + # Use the system BLAS from Accelerate or vecLib under OSX + args = [] + link_args = [] + if get_platform()[-4:] == 'i386' or 'intel' in get_platform() or \ + 'x86_64' in get_platform() or \ + 'i386' in platform.platform(): + intel = 1 + else: + intel = 0 + if (os.path.exists('/System/Library/Frameworks' + '/Accelerate.framework/') and + 'accelerate' in libraries): + if intel: + args.extend(['-msse3']) + else: + args.extend(['-faltivec']) + args.extend([ + '-I/System/Library/Frameworks/vecLib.framework/Headers']) + link_args.extend(['-Wl,-framework', '-Wl,Accelerate']) + elif (os.path.exists('/System/Library/Frameworks' + '/vecLib.framework/') and + 'veclib' in libraries): + if intel: + args.extend(['-msse3']) + else: + args.extend(['-faltivec']) + args.extend([ + '-I/System/Library/Frameworks/vecLib.framework/Headers']) + link_args.extend(['-Wl,-framework', '-Wl,vecLib']) + + if args: + self.set_info(extra_compile_args=args, + extra_link_args=link_args, + define_macros=[('NO_ATLAS_INFO', 3), + ('HAVE_CBLAS', None)]) + + return class blas_src_info(system_info): section = 'blas_src' diff --git a/numpy/distutils/unixccompiler.py b/numpy/distutils/unixccompiler.py index 6ed5eec6f..11b2cce52 100644 --- a/numpy/distutils/unixccompiler.py +++ b/numpy/distutils/unixccompiler.py @@ -61,8 +61,9 @@ def UnixCCompiler__compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts raise CompileError(msg) # add commandline flags to dependency file - with open(obj + '.d', 'a') as f: - f.write(_commandline_dep_string(cc_args, extra_postargs, pp_opts)) + if deps: + with open(obj + '.d', 'a') as f: + f.write(_commandline_dep_string(cc_args, extra_postargs, pp_opts)) replace_method(UnixCCompiler, '_compile', UnixCCompiler__compile) diff --git a/numpy/f2py/src/fortranobject.c b/numpy/f2py/src/fortranobject.c index dd2484eb4..78b06f066 100644 
--- a/numpy/f2py/src/fortranobject.c +++ b/numpy/f2py/src/fortranobject.c @@ -539,7 +539,7 @@ void f2py_report_on_exit(int exit_flag,void *name) { fprintf(stderr,"(d) f2py call-back interface, %6d calls : %8d msec\n", cb_passed_counter,cb_passed_time); - fprintf(stderr,"(e) wrapped (Fortran/C) functions (acctual) : %8d msec\n\n", + fprintf(stderr,"(e) wrapped (Fortran/C) functions (actual) : %8d msec\n\n", passed_call_time-cb_passed_call_time-cb_passed_time); fprintf(stderr,"Use -DF2PY_REPORT_ATEXIT_DISABLE to disable this message.\n"); fprintf(stderr,"Exit status: %d\n",exit_flag); diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 27143e5c6..b604b8c52 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -205,7 +205,11 @@ class LineSplitter(object): # def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): + delimiter = _decode_line(delimiter) + comments = _decode_line(comments) + self.comments = comments + # Delimiter is a character if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py index daaa68d06..e9ca9de4d 100644 --- a/numpy/lib/arraypad.py +++ b/numpy/lib/arraypad.py @@ -74,6 +74,35 @@ def _round_ifneeded(arr, dtype): arr.round(out=arr) +def _slice_at_axis(shape, sl, axis): + """ + Construct a slice tuple the length of shape, with sl at the specified axis + """ + slice_tup = (slice(None),) + return slice_tup * axis + (sl,) + slice_tup * (len(shape) - axis - 1) + + +def _slice_first(shape, n, axis): + """ Construct a slice tuple to take the first n elements along axis """ + return _slice_at_axis(shape, slice(0, n), axis=axis) + + +def _slice_last(shape, n, axis): + """ Construct a slice tuple to take the last n elements along axis """ + dim = shape[axis] # doing this explicitly makes n=0 work + return _slice_at_axis(shape, slice(dim - n, dim), axis=axis) + + +def _do_prepend(arr, pad_chunk, axis): + return 
np.concatenate( + (pad_chunk.astype(arr.dtype, copy=False), arr), axis=axis) + + +def _do_append(arr, pad_chunk, axis): + return np.concatenate( + (arr, pad_chunk.astype(arr.dtype, copy=False)), axis=axis) + + def _prepend_const(arr, pad_amt, val, axis=-1): """ Prepend constant `val` along `axis` of `arr`. @@ -100,12 +129,7 @@ def _prepend_const(arr, pad_amt, val, axis=-1): return arr padshape = tuple(x if i != axis else pad_amt for (i, x) in enumerate(arr.shape)) - if val == 0: - return np.concatenate((np.zeros(padshape, dtype=arr.dtype), arr), - axis=axis) - else: - return np.concatenate(((np.zeros(padshape) + val).astype(arr.dtype), - arr), axis=axis) + return _do_prepend(arr, np.full(padshape, val, dtype=arr.dtype), axis) def _append_const(arr, pad_amt, val, axis=-1): @@ -134,12 +158,8 @@ def _append_const(arr, pad_amt, val, axis=-1): return arr padshape = tuple(x if i != axis else pad_amt for (i, x) in enumerate(arr.shape)) - if val == 0: - return np.concatenate((arr, np.zeros(padshape, dtype=arr.dtype)), - axis=axis) - else: - return np.concatenate( - (arr, (np.zeros(padshape) + val).astype(arr.dtype)), axis=axis) + return _do_append(arr, np.full(padshape, val, dtype=arr.dtype), axis) + def _prepend_edge(arr, pad_amt, axis=-1): @@ -164,15 +184,9 @@ def _prepend_edge(arr, pad_amt, axis=-1): if pad_amt == 0: return arr - edge_slice = tuple(slice(None) if i != axis else 0 - for (i, x) in enumerate(arr.shape)) - - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - edge_arr = arr[edge_slice].reshape(pad_singleton) - return np.concatenate((edge_arr.repeat(pad_amt, axis=axis), arr), - axis=axis) + edge_slice = _slice_first(arr.shape, 1, axis=axis) + edge_arr = arr[edge_slice] + return _do_prepend(arr, edge_arr.repeat(pad_amt, axis=axis), axis) def _append_edge(arr, pad_amt, axis=-1): @@ -198,15 +212,9 @@ def _append_edge(arr, pad_amt, axis=-1): if pad_amt == 0: return arr - 
edge_slice = tuple(slice(None) if i != axis else arr.shape[axis] - 1 - for (i, x) in enumerate(arr.shape)) - - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - edge_arr = arr[edge_slice].reshape(pad_singleton) - return np.concatenate((arr, edge_arr.repeat(pad_amt, axis=axis)), - axis=axis) + edge_slice = _slice_last(arr.shape, 1, axis=axis) + edge_arr = arr[edge_slice] + return _do_append(arr, edge_arr.repeat(pad_amt, axis=axis), axis) def _prepend_ramp(arr, pad_amt, end, axis=-1): @@ -244,15 +252,10 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1): reverse=True).astype(np.float64) # Appropriate slicing to extract n-dimensional edge along `axis` - edge_slice = tuple(slice(None) if i != axis else 0 - for (i, x) in enumerate(arr.shape)) + edge_slice = _slice_first(arr.shape, 1, axis=axis) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract edge, reshape to original rank, and extend along `axis` - edge_pad = arr[edge_slice].reshape(pad_singleton).repeat(pad_amt, axis) + # Extract edge, and extend along `axis` + edge_pad = arr[edge_slice].repeat(pad_amt, axis) # Linear ramp slope = (end - edge_pad) / float(pad_amt) @@ -261,7 +264,7 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1): _round_ifneeded(ramp_arr, arr.dtype) # Ramp values will most likely be float, cast them to the same type as arr - return np.concatenate((ramp_arr.astype(arr.dtype), arr), axis=axis) + return _do_prepend(arr, ramp_arr, axis) def _append_ramp(arr, pad_amt, end, axis=-1): @@ -299,15 +302,10 @@ def _append_ramp(arr, pad_amt, end, axis=-1): reverse=False).astype(np.float64) # Slice a chunk from the edge to calculate stats on - edge_slice = tuple(slice(None) if i != axis else -1 - for (i, x) in enumerate(arr.shape)) - - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis 
else 1 - for (i, x) in enumerate(arr.shape)) + edge_slice = _slice_last(arr.shape, 1, axis=axis) - # Extract edge, reshape to original rank, and extend along `axis` - edge_pad = arr[edge_slice].reshape(pad_singleton).repeat(pad_amt, axis) + # Extract edge, and extend along `axis` + edge_pad = arr[edge_slice].repeat(pad_amt, axis) # Linear ramp slope = (end - edge_pad) / float(pad_amt) @@ -316,7 +314,7 @@ def _append_ramp(arr, pad_amt, end, axis=-1): _round_ifneeded(ramp_arr, arr.dtype) # Ramp values will most likely be float, cast them to the same type as arr - return np.concatenate((arr, ramp_arr.astype(arr.dtype)), axis=axis) + return _do_append(arr, ramp_arr, axis) def _prepend_max(arr, pad_amt, num, axis=-1): @@ -356,19 +354,13 @@ def _prepend_max(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - max_slice = tuple(slice(None) if i != axis else slice(num) - for (i, x) in enumerate(arr.shape)) + max_slice = _slice_first(arr.shape, num, axis=axis) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate max, reshape to add singleton dimension back - max_chunk = arr[max_slice].max(axis=axis).reshape(pad_singleton) + # Extract slice, calculate max + max_chunk = arr[max_slice].max(axis=axis, keepdims=True) # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt` - return np.concatenate((max_chunk.repeat(pad_amt, axis=axis), arr), - axis=axis) + return _do_prepend(arr, max_chunk.repeat(pad_amt, axis=axis), axis) def _append_max(arr, pad_amt, num, axis=-1): @@ -407,24 +399,16 @@ def _append_max(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - end = arr.shape[axis] - 1 if num is not None: - max_slice = tuple( - slice(None) if i != axis else slice(end, end - num, -1) - for (i, x) in enumerate(arr.shape)) + max_slice = _slice_last(arr.shape, num, axis=axis) 
else: max_slice = tuple(slice(None) for x in arr.shape) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate max, reshape to add singleton dimension back - max_chunk = arr[max_slice].max(axis=axis).reshape(pad_singleton) + # Extract slice, calculate max + max_chunk = arr[max_slice].max(axis=axis, keepdims=True) # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt` - return np.concatenate((arr, max_chunk.repeat(pad_amt, axis=axis)), - axis=axis) + return _do_append(arr, max_chunk.repeat(pad_amt, axis=axis), axis) def _prepend_mean(arr, pad_amt, num, axis=-1): @@ -463,20 +447,14 @@ def _prepend_mean(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - mean_slice = tuple(slice(None) if i != axis else slice(num) - for (i, x) in enumerate(arr.shape)) + mean_slice = _slice_first(arr.shape, num, axis=axis) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate mean, reshape to add singleton dimension back - mean_chunk = arr[mean_slice].mean(axis).reshape(pad_singleton) + # Extract slice, calculate mean + mean_chunk = arr[mean_slice].mean(axis, keepdims=True) _round_ifneeded(mean_chunk, arr.dtype) # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt` - return np.concatenate((mean_chunk.repeat(pad_amt, axis).astype(arr.dtype), - arr), axis=axis) + return _do_prepend(arr, mean_chunk.repeat(pad_amt, axis), axis=axis) def _append_mean(arr, pad_amt, num, axis=-1): @@ -515,25 +493,17 @@ def _append_mean(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - end = arr.shape[axis] - 1 if num is not None: - mean_slice = tuple( - slice(None) if i != axis else slice(end, end - num, -1) - for (i, x) in enumerate(arr.shape)) + mean_slice = 
_slice_last(arr.shape, num, axis=axis) else: mean_slice = tuple(slice(None) for x in arr.shape) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate mean, reshape to add singleton dimension back - mean_chunk = arr[mean_slice].mean(axis=axis).reshape(pad_singleton) + # Extract slice, calculate mean + mean_chunk = arr[mean_slice].mean(axis=axis, keepdims=True) _round_ifneeded(mean_chunk, arr.dtype) # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt` - return np.concatenate( - (arr, mean_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis) + return _do_append(arr, mean_chunk.repeat(pad_amt, axis), axis=axis) def _prepend_med(arr, pad_amt, num, axis=-1): @@ -572,20 +542,14 @@ def _prepend_med(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - med_slice = tuple(slice(None) if i != axis else slice(num) - for (i, x) in enumerate(arr.shape)) - - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) + med_slice = _slice_first(arr.shape, num, axis=axis) - # Extract slice, calculate median, reshape to add singleton dimension back - med_chunk = np.median(arr[med_slice], axis=axis).reshape(pad_singleton) + # Extract slice, calculate median + med_chunk = np.median(arr[med_slice], axis=axis, keepdims=True) _round_ifneeded(med_chunk, arr.dtype) # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt` - return np.concatenate( - (med_chunk.repeat(pad_amt, axis).astype(arr.dtype), arr), axis=axis) + return _do_prepend(arr, med_chunk.repeat(pad_amt, axis), axis=axis) def _append_med(arr, pad_amt, num, axis=-1): @@ -624,25 +588,17 @@ def _append_med(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - end = arr.shape[axis] - 1 if num is not None: - med_slice = 
tuple( - slice(None) if i != axis else slice(end, end - num, -1) - for (i, x) in enumerate(arr.shape)) + med_slice = _slice_last(arr.shape, num, axis=axis) else: med_slice = tuple(slice(None) for x in arr.shape) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate median, reshape to add singleton dimension back - med_chunk = np.median(arr[med_slice], axis=axis).reshape(pad_singleton) + # Extract slice, calculate median + med_chunk = np.median(arr[med_slice], axis=axis, keepdims=True) _round_ifneeded(med_chunk, arr.dtype) # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt` - return np.concatenate( - (arr, med_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis) + return _do_append(arr, med_chunk.repeat(pad_amt, axis), axis=axis) def _prepend_min(arr, pad_amt, num, axis=-1): @@ -682,19 +638,13 @@ def _prepend_min(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - min_slice = tuple(slice(None) if i != axis else slice(num) - for (i, x) in enumerate(arr.shape)) - - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) + min_slice = _slice_first(arr.shape, num, axis=axis) - # Extract slice, calculate min, reshape to add singleton dimension back - min_chunk = arr[min_slice].min(axis=axis).reshape(pad_singleton) + # Extract slice, calculate min + min_chunk = arr[min_slice].min(axis=axis, keepdims=True) # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt` - return np.concatenate((min_chunk.repeat(pad_amt, axis=axis), arr), - axis=axis) + return _do_prepend(arr, min_chunk.repeat(pad_amt, axis), axis=axis) def _append_min(arr, pad_amt, num, axis=-1): @@ -733,24 +683,16 @@ def _append_min(arr, pad_amt, num, axis=-1): num = None # Slice a chunk from the edge to calculate stats on - end = 
arr.shape[axis] - 1 if num is not None: - min_slice = tuple( - slice(None) if i != axis else slice(end, end - num, -1) - for (i, x) in enumerate(arr.shape)) + min_slice = _slice_last(arr.shape, num, axis=axis) else: min_slice = tuple(slice(None) for x in arr.shape) - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - - # Extract slice, calculate min, reshape to add singleton dimension back - min_chunk = arr[min_slice].min(axis=axis).reshape(pad_singleton) + # Extract slice, calculate min + min_chunk = arr[min_slice].min(axis=axis, keepdims=True) # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt` - return np.concatenate((arr, min_chunk.repeat(pad_amt, axis=axis)), - axis=axis) + return _do_append(arr, min_chunk.repeat(pad_amt, axis), axis=axis) def _pad_ref(arr, pad_amt, method, axis=-1): @@ -793,22 +735,14 @@ def _pad_ref(arr, pad_amt, method, axis=-1): # Prepended region # Slice off a reverse indexed chunk from near edge to pad `arr` before - ref_slice = tuple(slice(None) if i != axis else slice(pad_amt[0], 0, -1) - for (i, x) in enumerate(arr.shape)) + ref_slice = _slice_at_axis(arr.shape, slice(pad_amt[0], 0, -1), axis=axis) ref_chunk1 = arr[ref_slice] - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - if pad_amt[0] == 1: - ref_chunk1 = ref_chunk1.reshape(pad_singleton) - # Memory/computationally more expensive, only do this if `method='odd'` if 'odd' in method and pad_amt[0] > 0: - edge_slice1 = tuple(slice(None) if i != axis else 0 - for (i, x) in enumerate(arr.shape)) - edge_chunk = arr[edge_slice1].reshape(pad_singleton) + edge_slice1 = _slice_first(arr.shape, 1, axis=axis) + edge_chunk = arr[edge_slice1] ref_chunk1 = 2 * edge_chunk - ref_chunk1 del edge_chunk @@ -818,19 +752,13 @@ def _pad_ref(arr, pad_amt, method, axis=-1): # Slice off a reverse indexed chunk 
from far edge to pad `arr` after start = arr.shape[axis] - pad_amt[1] - 1 end = arr.shape[axis] - 1 - ref_slice = tuple(slice(None) if i != axis else slice(start, end) - for (i, x) in enumerate(arr.shape)) - rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1) - for (i, x) in enumerate(arr.shape)) + ref_slice = _slice_at_axis(arr.shape, slice(start, end), axis=axis) + rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis) ref_chunk2 = arr[ref_slice][rev_idx] - if pad_amt[1] == 1: - ref_chunk2 = ref_chunk2.reshape(pad_singleton) - if 'odd' in method: - edge_slice2 = tuple(slice(None) if i != axis else -1 - for (i, x) in enumerate(arr.shape)) - edge_chunk = arr[edge_slice2].reshape(pad_singleton) + edge_slice2 = _slice_last(arr.shape, 1, axis=axis) + edge_chunk = arr[edge_slice2] ref_chunk2 = 2 * edge_chunk - ref_chunk2 del edge_chunk @@ -878,23 +806,14 @@ def _pad_sym(arr, pad_amt, method, axis=-1): # Prepended region # Slice off a reverse indexed chunk from near edge to pad `arr` before - sym_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[0]) - for (i, x) in enumerate(arr.shape)) - rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1) - for (i, x) in enumerate(arr.shape)) + sym_slice = _slice_first(arr.shape, pad_amt[0], axis=axis) + rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis) sym_chunk1 = arr[sym_slice][rev_idx] - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - if pad_amt[0] == 1: - sym_chunk1 = sym_chunk1.reshape(pad_singleton) - # Memory/computationally more expensive, only do this if `method='odd'` if 'odd' in method and pad_amt[0] > 0: - edge_slice1 = tuple(slice(None) if i != axis else 0 - for (i, x) in enumerate(arr.shape)) - edge_chunk = arr[edge_slice1].reshape(pad_singleton) + edge_slice1 = _slice_first(arr.shape, 1, axis=axis) + edge_chunk = arr[edge_slice1] sym_chunk1 = 2 * 
edge_chunk - sym_chunk1 del edge_chunk @@ -902,19 +821,12 @@ def _pad_sym(arr, pad_amt, method, axis=-1): # Appended region # Slice off a reverse indexed chunk from far edge to pad `arr` after - start = arr.shape[axis] - pad_amt[1] - end = arr.shape[axis] - sym_slice = tuple(slice(None) if i != axis else slice(start, end) - for (i, x) in enumerate(arr.shape)) + sym_slice = _slice_last(arr.shape, pad_amt[1], axis=axis) sym_chunk2 = arr[sym_slice][rev_idx] - if pad_amt[1] == 1: - sym_chunk2 = sym_chunk2.reshape(pad_singleton) - if 'odd' in method: - edge_slice2 = tuple(slice(None) if i != axis else -1 - for (i, x) in enumerate(arr.shape)) - edge_chunk = arr[edge_slice2].reshape(pad_singleton) + edge_slice2 = _slice_last(arr.shape, 1, axis=axis) + edge_chunk = arr[edge_slice2] sym_chunk2 = 2 * edge_chunk - sym_chunk2 del edge_chunk @@ -959,29 +871,16 @@ def _pad_wrap(arr, pad_amt, axis=-1): # Prepended region # Slice off a reverse indexed chunk from near edge to pad `arr` before - start = arr.shape[axis] - pad_amt[0] - end = arr.shape[axis] - wrap_slice = tuple(slice(None) if i != axis else slice(start, end) - for (i, x) in enumerate(arr.shape)) + wrap_slice = _slice_last(arr.shape, pad_amt[0], axis=axis) wrap_chunk1 = arr[wrap_slice] - # Shape to restore singleton dimension after slicing - pad_singleton = tuple(x if i != axis else 1 - for (i, x) in enumerate(arr.shape)) - if pad_amt[0] == 1: - wrap_chunk1 = wrap_chunk1.reshape(pad_singleton) - ########################################################################## # Appended region # Slice off a reverse indexed chunk from far edge to pad `arr` after - wrap_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[1]) - for (i, x) in enumerate(arr.shape)) + wrap_slice = _slice_first(arr.shape, pad_amt[1], axis=axis) wrap_chunk2 = arr[wrap_slice] - if pad_amt[1] == 1: - wrap_chunk2 = wrap_chunk2.reshape(pad_singleton) - # Concatenate `arr` with both chunks, extending along `axis` return 
np.concatenate((wrap_chunk1, arr, wrap_chunk2), axis=axis) diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index e8eda297f..4d3f35183 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -298,7 +298,7 @@ def _unique1d(ar, return_index=False, return_inverse=False, return ret -def intersect1d(ar1, ar2, assume_unique=False): +def intersect1d(ar1, ar2, assume_unique=False, return_indices=False): """ Find the intersection of two arrays. @@ -307,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False): Parameters ---------- ar1, ar2 : array_like - Input arrays. + Input arrays. Will be flattened if not already 1D. assume_unique : bool If True, the input arrays are both assumed to be unique, which can speed up the calculation. Default is False. - + return_indices : bool + If True, the indices which correspond to the intersection of the + two arrays are returned. The first instance of a value is used + if there are multiple. Default is False. + + .. versionadded:: 1.15.0 + Returns ------- intersect1d : ndarray Sorted 1D array of common and unique elements. + comm1 : ndarray + The indices of the first occurrences of the common values in `ar1`. + Only provided if `return_indices` is True. + comm2 : ndarray + The indices of the first occurrences of the common values in `ar2`. + Only provided if `return_indices` is True. 
+ See Also -------- @@ -332,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False): >>> from functools import reduce >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2])) array([3]) + + To return the indices of the values common to the input arrays + along with the intersected values: + >>> x = np.array([1, 1, 2, 3, 4]) + >>> y = np.array([2, 1, 4, 6]) + >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True) + >>> x_ind, y_ind + (array([0, 2, 4]), array([1, 0, 2])) + >>> xy, x[x_ind], y[y_ind] + (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4])) + """ if not assume_unique: - # Might be faster than unique( intersect1d( ar1, ar2 ) )? - ar1 = unique(ar1) - ar2 = unique(ar2) + if return_indices: + ar1, ind1 = unique(ar1, return_index=True) + ar2, ind2 = unique(ar2, return_index=True) + else: + ar1 = unique(ar1) + ar2 = unique(ar2) + else: + ar1 = ar1.ravel() + ar2 = ar2.ravel() + aux = np.concatenate((ar1, ar2)) - aux.sort() - return aux[:-1][aux[1:] == aux[:-1]] + if return_indices: + aux_sort_indices = np.argsort(aux, kind='mergesort') + aux = aux[aux_sort_indices] + else: + aux.sort() + + mask = aux[1:] == aux[:-1] + int1d = aux[:-1][mask] + + if return_indices: + ar1_indices = aux_sort_indices[:-1][mask] + ar2_indices = aux_sort_indices[1:][mask] - ar1.size + if not assume_unique: + ar1_indices = ind1[ar1_indices] + ar2_indices = ind2[ar2_indices] + + return int1d, ar1_indices, ar2_indices + else: + return int1d def setxor1d(ar1, ar2, assume_unique=False): """ @@ -660,3 +708,4 @@ def setdiff1d(ar1, ar2, assume_unique=False): ar1 = unique(ar1) ar2 = unique(ar2) return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)] + diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 363bb2101..23eac7e7d 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -1,5 +1,10 @@ """ -Define a simple format for saving numpy arrays to disk with the full +Binary serialization + +NPY format +========== + +A simple format for saving 
numpy arrays to disk with the full information about them. The ``.npy`` format is the standard binary file format in NumPy for @@ -143,8 +148,10 @@ data HEADER_LEN." Notes ----- -The ``.npy`` format, including reasons for creating it and a comparison of -alternatives, is described fully in the "npy-format" NEP. +The ``.npy`` format, including motivation for creating it and a comparison of +alternatives, is described in the `"npy-format" NEP +<http://www.numpy.org/neps/nep-0001-npy-format.html>`_, however details have +evolved with time and this document is more current. """ from __future__ import division, absolute_import, print_function diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 72beef471..70aa654dc 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -109,9 +109,8 @@ def rot90(m, k=1, axes=(0,1)): >>> np.rot90(m, 1, (1,2)) array([[[1, 3], [0, 2]], - - [[5, 7], - [4, 6]]]) + [[5, 7], + [4, 6]]]) """ axes = tuple(axes) @@ -1633,9 +1632,9 @@ def disp(mesg, device=None, linefeed=True): Besides ``sys.stdout``, a file-like object can also be used as it has both required methods: - >>> from StringIO import StringIO + >>> from io import StringIO >>> buf = StringIO() - >>> np.disp('"Display" in a file', device=buf) + >>> np.disp(u'"Display" in a file', device=buf) >>> buf.getvalue() '"Display" in a file\\n' diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index d2a398a0a..2922b3a86 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -877,12 +877,6 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): # bins is an integer bins = D*[bins] - # avoid rounding issues for comparisons when dealing with inexact types - if np.issubdtype(sample.dtype, np.inexact): - edge_dt = sample.dtype - else: - edge_dt = float - # normalize the range argument if range is None: range = (None,) * D @@ -896,13 +890,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, 
weights=None): raise ValueError( '`bins[{}]` must be positive, when an integer'.format(i)) smin, smax = _get_outer_edges(sample[:,i], range[i]) - edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt) + edges[i] = np.linspace(smin, smax, bins[i] + 1) elif np.ndim(bins[i]) == 1: - edges[i] = np.asarray(bins[i], edge_dt) - # not just monotonic, due to the use of mindiff below - if np.any(edges[i][:-1] >= edges[i][1:]): + edges[i] = np.asarray(bins[i]) + if np.any(edges[i][:-1] > edges[i][1:]): raise ValueError( - '`bins[{}]` must be strictly increasing, when an array' + '`bins[{}]` must be monotonically increasing, when an array' .format(i)) else: raise ValueError( @@ -911,13 +904,10 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): nbin[i] = len(edges[i]) + 1 # includes an outlier on each end dedges[i] = np.diff(edges[i]) - # Handle empty input. - if N == 0: - return np.zeros(nbin-2), edges - # Compute the bin number each sample falls into. Ncount = tuple( - np.digitize(sample[:, i], edges[i]) + # avoid np.digitize to work around gh-11022 + np.searchsorted(edges[i], sample[:, i], side='right') for i in _range(D) ) @@ -925,16 +915,10 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None): # For the rightmost bin, we want values equal to the right edge to be # counted in the last bin, and not as an outlier. for i in _range(D): - # Rounding precision - mindiff = dedges[i].min() - if not np.isinf(mindiff): - decimal = int(-np.log10(mindiff)) + 6 - # Find which points are on the rightmost edge. - not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) - on_edge = (np.around(sample[:, i], decimal) == - np.around(edges[i][-1], decimal)) - # Shift these points one bin to the left. - Ncount[i][on_edge & not_smaller_than_edge] -= 1 + # Find which points are on the rightmost edge. + on_edge = (sample[:, i] == edges[i][-1]) + # Shift these points one bin to the left. 
+ Ncount[i][on_edge] -= 1 # Compute the sample indices in the flattened histogram matrix. # This raises an error if the array is too large. diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py index 43fdc5627..d2139338e 100644 --- a/numpy/lib/index_tricks.py +++ b/numpy/lib/index_tricks.py @@ -201,7 +201,7 @@ class nd_grid(object): slobj = [_nx.newaxis]*len(size) for k in range(len(size)): slobj[k] = slice(None, None) - nn[k] = nn[k][slobj] + nn[k] = nn[k][tuple(slobj)] slobj[k] = _nx.newaxis return nn except (IndexError, TypeError): diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 67585443b..390927601 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -26,9 +26,11 @@ from numpy.compat import ( if sys.version_info[0] >= 3: import pickle + from collections.abc import Mapping else: import cPickle as pickle from future_builtins import map + from collections import Mapping def loads(*args, **kwargs): @@ -92,7 +94,7 @@ class BagObj(object): This also enables tab-completion in an interpreter or IPython. """ - return object.__getattribute__(self, '_obj').keys() + return list(object.__getattribute__(self, '_obj').keys()) def zipfile_factory(file, *args, **kwargs): @@ -110,7 +112,7 @@ def zipfile_factory(file, *args, **kwargs): return zipfile.ZipFile(file, *args, **kwargs) -class NpzFile(object): +class NpzFile(Mapping): """ NpzFile(fid) @@ -216,6 +218,13 @@ class NpzFile(object): def __del__(self): self.close() + # Implement the Mapping ABC + def __iter__(self): + return iter(self.files) + + def __len__(self): + return len(self.files) + def __getitem__(self, key): # FIXME: This seems like it will copy strings around # more than is strictly necessary. The zipfile @@ -225,11 +234,11 @@ class NpzFile(object): # It would be better if the zipfile could read # (or at least uncompress) the data # directly into the array memory. 
- member = 0 + member = False if key in self._files: - member = 1 + member = True elif key in self.files: - member = 1 + member = True key += '.npy' if member: bytes = self.zip.open(key) @@ -245,31 +254,27 @@ class NpzFile(object): else: raise KeyError("%s is not a file in the archive" % key) - def __iter__(self): - return iter(self.files) - - def items(self): - """ - Return a list of tuples, with each tuple (filename, array in file). - - """ - return [(f, self[f]) for f in self.files] - - def iteritems(self): - """Generator that returns tuples (filename, array in file).""" - for f in self.files: - yield (f, self[f]) - def keys(self): - """Return files in the archive with a ``.npy`` extension.""" - return self.files + if sys.version_info.major == 3: + # deprecate the python 2 dict apis that we supported by accident in + # python 3. We forgot to implement itervalues() at all in earlier + # versions of numpy, so no need to deprecated it here. - def iterkeys(self): - """Return an iterator over the files in the archive.""" - return self.__iter__() + def iteritems(self): + # Numpy 1.15, 2018-02-20 + warnings.warn( + "NpzFile.iteritems is deprecated in python 3, to match the " + "removal of dict.itertems. Use .items() instead.", + DeprecationWarning, stacklevel=2) + return self.items() - def __contains__(self, key): - return self.files.__contains__(key) + def iterkeys(self): + # Numpy 1.15, 2018-02-20 + warnings.warn( + "NpzFile.iterkeys is deprecated in python 3, to match the " + "removal of dict.iterkeys. 
Use .keys() instead.", + DeprecationWarning, stacklevel=2) + return self.keys() def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, @@ -475,9 +480,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True): Notes ----- - For a description of the ``.npy`` format, see the module docstring - of `numpy.lib.format` or the NumPy Enhancement Proposal - http://numpy.github.io/neps/npy-format.html + For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. Examples -------- @@ -561,9 +564,7 @@ def savez(file, *args, **kwds): The ``.npz`` file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in ``.npy`` format. For a - description of the ``.npy`` format, see `numpy.lib.format` or the - NumPy Enhancement Proposal - http://numpy.github.io/neps/npy-format.html + description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. When opening the saved ``.npz`` file with `load` a `NpzFile` object is returned. This is a dictionary-like object which can be queried for @@ -642,9 +643,9 @@ def savez_compressed(file, *args, **kwds): The ``.npz`` file format is a zipped archive of files named after the variables they contain. The archive is compressed with ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable - in ``.npy`` format. For a description of the ``.npy`` format, see - `numpy.lib.format` or the NumPy Enhancement Proposal - http://numpy.github.io/neps/npy-format.html + in ``.npy`` format. For a description of the ``.npy`` format, see + :py:mod:`numpy.lib.format`. + When opening the saved ``.npz`` file with `load` a `NpzFile` object is returned. This is a dictionary-like object which can be queried for @@ -791,8 +792,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, the data-type. 
comments : str or sequence of str, optional The characters or list of characters used to indicate the start of a - comment. For backwards compatibility, byte strings will be decoded as - 'latin1'. The default is '#'. + comment. None implies no comments. For backwards compatibility, byte + strings will be decoded as 'latin1'. The default is '#'. delimiter : str, optional The string used to separate values. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is whitespace. @@ -859,18 +860,18 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Examples -------- >>> from io import StringIO # StringIO behaves like a file object - >>> c = StringIO("0 1\\n2 3") + >>> c = StringIO(u"0 1\\n2 3") >>> np.loadtxt(c) array([[ 0., 1.], [ 2., 3.]]) - >>> d = StringIO("M 21 72\\nF 35 58") + >>> d = StringIO(u"M 21 72\\nF 35 58") >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), ... 'formats': ('S1', 'i4', 'f4')}) array([('M', 21, 72.0), ('F', 35, 58.0)], dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')]) - >>> c = StringIO("1,0,2\\n3,0,4") + >>> c = StringIO(u"1,0,2\\n3,0,4") >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) >>> x array([ 1., 3.]) @@ -1632,7 +1633,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, Comma delimited file with mixed dtype - >>> s = StringIO("1,1.3,abcde") + >>> s = StringIO(u"1,1.3,abcde") >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'), ... ('mystring','S5')], delimiter=",") >>> data @@ -1659,7 +1660,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, An example with fixed-width columns - >>> s = StringIO("11.3abcde") + >>> s = StringIO(u"11.3abcde") >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'], ... 
delimiter=[1,3,5]) >>> data diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py index 41b5e2f64..078608bbb 100644 --- a/numpy/lib/polynomial.py +++ b/numpy/lib/polynomial.py @@ -113,11 +113,6 @@ def poly(seq_of_zeros): >>> np.poly(P) array([ 1. , 0. , 0.16666667]) - Or a square matrix object: - - >>> np.poly(np.matrix(P)) - array([ 1. , 0. , 0.16666667]) - Note how in all cases the leading coefficient is always 1. """ diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py index e9ba38f46..c455bd93f 100644 --- a/numpy/lib/recfunctions.py +++ b/numpy/lib/recfunctions.py @@ -397,12 +397,13 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False, Notes ----- * Without a mask, the missing value will be filled with something, - * depending on what its corresponding type: - -1 for integers - -1.0 for floating point numbers - '-' for characters - '-1' for strings - True for boolean values + depending on what its corresponding type: + + * ``-1`` for integers + * ``-1.0`` for floating point numbers + * ``'-'`` for characters + * ``'-1'`` for strings + * ``True`` for boolean values * XXX: I just obtained these values empirically """ # Only one item in the input sequence ? 
diff --git a/numpy/lib/scimath.py b/numpy/lib/scimath.py index e07caf805..f1838fee6 100644 --- a/numpy/lib/scimath.py +++ b/numpy/lib/scimath.py @@ -555,7 +555,7 @@ def arctanh(x): -------- >>> np.set_printoptions(precision=4) - >>> np.emath.arctanh(np.matrix(np.eye(2))) + >>> np.emath.arctanh(np.eye(2)) array([[ Inf, 0.], [ 0., Inf]]) >>> np.emath.arctanh([1j]) diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py index 41ef28ef3..65104115a 100644 --- a/numpy/lib/shape_base.py +++ b/numpy/lib/shape_base.py @@ -16,10 +16,235 @@ from numpy.matrixlib.defmatrix import matrix # this raises all the right alarm __all__ = [ 'column_stack', 'row_stack', 'dstack', 'array_split', 'split', 'hsplit', 'vsplit', 'dsplit', 'apply_over_axes', 'expand_dims', - 'apply_along_axis', 'kron', 'tile', 'get_array_wrap' + 'apply_along_axis', 'kron', 'tile', 'get_array_wrap', 'take_along_axis', + 'put_along_axis' ] +def _make_along_axis_idx(arr_shape, indices, axis): + # compute dimensions to iterate over + if not _nx.issubdtype(indices.dtype, _nx.integer): + raise IndexError('`indices` must be an integer array') + if len(arr_shape) != indices.ndim: + raise ValueError( + "`indices` and `arr` must have the same number of dimensions") + shape_ones = (1,) * indices.ndim + dest_dims = list(range(axis)) + [None] + list(range(axis+1, indices.ndim)) + + # build a fancy index, consisting of orthogonal aranges, with the + # requested index inserted at the right location + fancy_index = [] + for dim, n in zip(dest_dims, arr_shape): + if dim is None: + fancy_index.append(indices) + else: + ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:] + fancy_index.append(_nx.arange(n).reshape(ind_shape)) + + return tuple(fancy_index) + + +def take_along_axis(arr, indices, axis): + """ + Take values from the input array by matching 1d index and data slices. 
+ + This iterates over matching 1d slices oriented along the specified axis in + the index and data arrays, and uses the former to look up values in the + latter. These slices can be different lengths. + + Functions returning an index along an axis, like `argsort` and + `argpartition`, produce suitable indices for this function. + + .. versionadded:: 1.15.0 + + Parameters + ---------- + arr: ndarray (Ni..., M, Nk...) + Source array + indices: ndarray (Ni..., J, Nk...) + Indices to take along each 1d slice of `arr`. This must match the + dimension of arr, but dimensions Ni and Nj only need to broadcast + against `arr`. + axis: int + The axis to take 1d slices along. If axis is None, the input array is + treated as if it had first been flattened to 1d, for consistency with + `sort` and `argsort`. + + Returns + ------- + out: ndarray (Ni..., J, Nk...) + The indexed result. + + Notes + ----- + This is equivalent to (but faster than) the following use of `ndindex` and + `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices:: + + Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:] + J = indices.shape[axis] # Need not equal M + out = np.empty(Nk + (J,) + Nk) + + for ii in ndindex(Ni): + for kk in ndindex(Nk): + a_1d = a [ii + s_[:,] + kk] + indices_1d = indices[ii + s_[:,] + kk] + out_1d = out [ii + s_[:,] + kk] + for j in range(J): + out_1d[j] = a_1d[indices_1d[j]] + + Equivalently, eliminating the inner loop, the last two lines would be:: + + out_1d[:] = a_1d[indices_1d] + + See Also + -------- + take : Take along an axis, using the same indices for every 1d slice + put_along_axis : + Put values into the destination array by matching 1d index and data slices + + Examples + -------- + + For this sample array + + >>> a = np.array([[10, 30, 20], [60, 40, 50]]) + + We can sort either by using sort directly, or argsort and this function + + >>> np.sort(a, axis=1) + array([[10, 20, 30], + [40, 50, 60]]) + >>> ai = np.argsort(a, axis=1); ai + array([[0, 2, 
1], + [1, 2, 0]], dtype=int64) + >>> np.take_along_axis(a, ai, axis=1) + array([[10, 20, 30], + [40, 50, 60]]) + + The same works for max and min, if you expand the dimensions: + + >>> np.expand_dims(np.max(a, axis=1), axis=1) + array([[30], + [60]]) + >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1) + >>> ai + array([[1], + [0], dtype=int64) + >>> np.take_along_axis(a, ai, axis=1) + array([[30], + [60]]) + + If we want to get the max and min at the same time, we can stack the + indices first + + >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1) + >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1) + >>> ai = np.concatenate([ai_min, ai_max], axis=axis) + >> ai + array([[0, 1], + [1, 0]], dtype=int64) + >>> np.take_along_axis(a, ai, axis=1) + array([[10, 30], + [40, 60]]) + """ + # normalize inputs + if axis is None: + arr = arr.flat + arr_shape = (len(arr),) # flatiter has no .shape + axis = 0 + else: + axis = normalize_axis_index(axis, arr.ndim) + arr_shape = arr.shape + + # use the fancy index + return arr[_make_along_axis_idx(arr_shape, indices, axis)] + + +def put_along_axis(arr, indices, values, axis): + """ + Put values into the destination array by matching 1d index and data slices. + + This iterates over matching 1d slices oriented along the specified axis in + the index and data arrays, and uses the former to place values into the + latter. These slices can be different lengths. + + Functions returning an index along an axis, like `argsort` and + `argpartition`, produce suitable indices for this function. + + .. versionadded:: 1.15.0 + + Parameters + ---------- + arr: ndarray (Ni..., M, Nk...) + Destination array. + indices: ndarray (Ni..., J, Nk...) + Indices to change along each 1d slice of `arr`. This must match the + dimension of arr, but dimensions in Ni and Nj may be 1 to broadcast + against `arr`. + values: array_like (Ni..., J, Nk...) + values to insert at those indices. 
Its shape and dimension are + broadcast to match that of `indices`. + axis: int + The axis to take 1d slices along. If axis is None, the destination + array is treated as if a flattened 1d view had been created of it. + + Notes + ----- + This is equivalent to (but faster than) the following use of `ndindex` and + `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices:: + + Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:] + J = indices.shape[axis] # Need not equal M + + for ii in ndindex(Ni): + for kk in ndindex(Nk): + a_1d = a [ii + s_[:,] + kk] + indices_1d = indices[ii + s_[:,] + kk] + values_1d = values [ii + s_[:,] + kk] + for j in range(J): + a_1d[indices_1d[j]] = values_1d[j] + + Equivalently, eliminating the inner loop, the last two lines would be:: + + a_1d[indices_1d] = values_1d + + See Also + -------- + take_along_axis : + Take values from the input array by matching 1d index and data slices + + Examples + -------- + + For this sample array + + >>> a = np.array([[10, 30, 20], [60, 40, 50]]) + + We can replace the maximum values with: + + >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1) + >>> ai + array([[1], + [0]], dtype=int64) + >>> np.put_along_axis(a, ai, 99, axis=1) + >>> a + array([[10, 99, 20], + [99, 40, 50]]) + + """ + # normalize inputs + if axis is None: + arr = arr.flat + axis = 0 + arr_shape = (len(arr),) # flatiter has no .shape + else: + axis = normalize_axis_index(axis, arr.ndim) + arr_shape = arr.shape + + # use the fancy index + arr[_make_along_axis_idx(arr_shape, indices, axis)] = values + + def apply_along_axis(func1d, axis, arr, *args, **kwargs): """ Apply a function to 1-D slices along the given axis. 
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py index 5f6c29a4d..b4888f1bd 100644 --- a/numpy/lib/tests/test__iotools.py +++ b/numpy/lib/tests/test__iotools.py @@ -53,6 +53,11 @@ class TestLineSplitter(object): test = LineSplitter(',')(strg) assert_equal(test, ['1', '2', '3', '4', '', '5']) + # gh-11028 bytes comment/delimiters should get encoded + strg = b" 1,2,3,4,,5 % test" + test = LineSplitter(delimiter=b',', comments=b'%')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) + def test_constant_fixed_width(self): "Test LineSplitter w/ fixed-width fields" strg = " 1 2 3 4 5 # test" diff --git a/numpy/lib/tests/test_arraypad.py b/numpy/lib/tests/test_arraypad.py index 8be49ce67..8ba0370b0 100644 --- a/numpy/lib/tests/test_arraypad.py +++ b/numpy/lib/tests/test_arraypad.py @@ -489,6 +489,19 @@ class TestConstant(object): ) assert_allclose(test, expected) + def test_check_large_integers(self): + uint64_max = 2 ** 64 - 1 + arr = np.full(5, uint64_max, dtype=np.uint64) + test = np.pad(arr, 1, mode="constant", constant_values=arr.min()) + expected = np.full(7, uint64_max, dtype=np.uint64) + assert_array_equal(test, expected) + + int64_max = 2 ** 63 - 1 + arr = np.full(5, int64_max, dtype=np.int64) + test = np.pad(arr, 1, mode="constant", constant_values=arr.min()) + expected = np.full(7, int64_max, dtype=np.int64) + assert_array_equal(test, expected) + class TestLinearRamp(object): def test_check_simple(self): diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index 76c36c53e..dace5ade8 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -32,7 +32,46 @@ class TestSetOps(object): assert_array_equal(c, ed) assert_array_equal([], intersect1d([], [])) - + + def test_intersect1d_indices(self): + # unique inputs + a = np.array([1, 2, 3, 4]) + b = np.array([2, 1, 4, 6]) + c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True) + ee = 
np.array([1, 2, 4]) + assert_array_equal(c, ee) + assert_array_equal(a[i1], ee) + assert_array_equal(b[i2], ee) + + # non-unique inputs + a = np.array([1, 2, 2, 3, 4, 3, 2]) + b = np.array([1, 8, 4, 2, 2, 3, 2, 3]) + c, i1, i2 = intersect1d(a, b, return_indices=True) + ef = np.array([1, 2, 3, 4]) + assert_array_equal(c, ef) + assert_array_equal(a[i1], ef) + assert_array_equal(b[i2], ef) + + # non1d, unique inputs + a = np.array([[2, 4, 5, 6], [7, 8, 1, 15]]) + b = np.array([[3, 2, 7, 6], [10, 12, 8, 9]]) + c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True) + ui1 = np.unravel_index(i1, a.shape) + ui2 = np.unravel_index(i2, b.shape) + ea = np.array([2, 6, 7, 8]) + assert_array_equal(ea, a[ui1]) + assert_array_equal(ea, b[ui2]) + + # non1d, not assumed to be uniqueinputs + a = np.array([[2, 4, 5, 6, 6], [4, 7, 8, 7, 2]]) + b = np.array([[3, 2, 7, 7], [10, 12, 8, 7]]) + c, i1, i2 = intersect1d(a, b, return_indices=True) + ui1 = np.unravel_index(i1, a.shape) + ui2 = np.unravel_index(i2, b.shape) + ea = np.array([2, 7, 8]) + assert_array_equal(ea, a[ui1]) + assert_array_equal(ea, b[ui2]) + def test_setxor1d(self): a = np.array([5, 7, 1, 2]) b = np.array([2, 4, 3, 1, 5]) @@ -74,8 +113,6 @@ class TestSetOps(object): assert_array_equal([1,7,8], ediff1d(two_elem, to_end=[7,8])) assert_array_equal([7,1], ediff1d(two_elem, to_begin=7)) assert_array_equal([5,6,1], ediff1d(two_elem, to_begin=[5,6])) - assert(isinstance(ediff1d(np.matrix(1)), np.matrix)) - assert(isinstance(ediff1d(np.matrix(1), to_begin=1), np.matrix)) def test_isin(self): # the tests for in1d cover most of isin's behavior diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 43d62a7ff..4103a9eb3 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -287,9 +287,6 @@ class TestAverage(object): assert_almost_equal(y5.mean(0), average(y5, 0)) assert_almost_equal(y5.mean(1), average(y5, 1)) - y6 = 
np.matrix(rand(5, 5)) - assert_array_equal(y6.mean(0), average(y6, 0)) - def test_weights(self): y = np.arange(10) w = np.arange(10) @@ -357,14 +354,6 @@ class TestAverage(object): assert_equal(type(np.average(a)), subclass) assert_equal(type(np.average(a, weights=w)), subclass) - # also test matrices - a = np.matrix([[1,2],[3,4]]) - w = np.matrix([[1,2],[3,4]]) - - r = np.average(a, axis=0, weights=w) - assert_equal(type(r), np.matrix) - assert_equal(r, [[2.5, 10.0/3]]) - def test_upcasting(self): types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'), ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')] @@ -1525,9 +1514,9 @@ class TestDigitize(object): class TestUnwrap(object): def test_simple(self): - # check that unwrap removes jumps greather that 2*pi + # check that unwrap removes jumps greater that 2*pi assert_array_equal(unwrap([1, 1 + 2 * np.pi]), [1, 1]) - # check that unwrap maintans continuity + # check that unwrap maintains continuity assert_(np.all(diff(unwrap(rand(10) * 100)) < np.pi)) @@ -1623,16 +1612,6 @@ class TestTrapz(object): xm = np.ma.array(x, mask=mask) assert_almost_equal(trapz(y, xm), r) - def test_matrix(self): - # Test to make sure matrices give the same answer as ndarrays - x = np.linspace(0, 5) - y = x * x - r = trapz(y, x) - mx = np.matrix(x) - my = np.matrix(y) - mr = trapz(my, mx) - assert_almost_equal(mr, r) - class TestSinc(object): @@ -2759,7 +2738,7 @@ class TestQuantile(object): assert_equal(np.quantile(x, 0.5), 1.75) def test_no_p_overwrite(self): - # this is worth retesting, beause quantile does not make a copy + # this is worth retesting, because quantile does not make a copy p0 = np.array([0, 0.75, 0.25, 0.5, 1.0]) p = p0.copy() np.quantile(np.arange(100.), p, interpolation="midpoint") diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py index 06daacbdc..e16ae12c2 100644 --- a/numpy/lib/tests/test_histograms.py +++ b/numpy/lib/tests/test_histograms.py @@ -253,7 +253,7 @@ class 
TestHistogram(object): one_nan = np.array([0, 1, np.nan]) all_nan = np.array([np.nan, np.nan]) - # the internal commparisons with NaN give warnings + # the internal comparisons with NaN give warnings sup = suppress_warnings() sup.filter(RuntimeWarning) with sup: @@ -613,8 +613,6 @@ class TestHistogramdd(object): assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5]) assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1]) assert_raises( - ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]]) - assert_raises( ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]]) assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]])) @@ -646,7 +644,7 @@ class TestHistogramdd(object): bins = [[0., 0.5, 1.0]] hist, _ = histogramdd(x, bins=bins) assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) + assert_(hist[1] == 0.0) x = [1.0001] bins = [[0., 0.5, 1.0]] hist, _ = histogramdd(x, bins=bins) @@ -660,3 +658,40 @@ class TestHistogramdd(object): range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]]) assert_raises(ValueError, histogramdd, vals, range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]]) + + def test_equal_edges(self): + """ Test that adjacent entries in an edge array can be equal """ + x = np.array([0, 1, 2]) + y = np.array([0, 1, 2]) + x_edges = np.array([0, 2, 2]) + y_edges = 1 + hist, edges = histogramdd((x, y), bins=(x_edges, y_edges)) + + hist_expected = np.array([ + [2.], + [1.], # x == 2 falls in the final bin + ]) + assert_equal(hist, hist_expected) + + def test_edge_dtype(self): + """ Test that if an edge array is input, its type is preserved """ + x = np.array([0, 10, 20]) + y = x / 10 + x_edges = np.array([0, 5, 15, 20]) + y_edges = x_edges / 10 + hist, edges = histogramdd((x, y), bins=(x_edges, y_edges)) + + assert_equal(edges[0].dtype, x_edges.dtype) + assert_equal(edges[1].dtype, y_edges.dtype) + + def test_large_integers(self): + big = 2**60 # Too large to represent with a full precision float + + x = np.array([0], np.int64) + x_edges = 
np.array([-1, +1], np.int64) + y = big + x + y_edges = big + x_edges + + hist, edges = histogramdd((x, y), bins=(x_edges, y_edges)) + + assert_equal(hist[0, 0], 1) diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py index f934e952a..315251daa 100644 --- a/numpy/lib/tests/test_index_tricks.py +++ b/numpy/lib/tests/test_index_tricks.py @@ -6,7 +6,7 @@ from numpy.testing import ( assert_array_almost_equal, assert_raises, assert_raises_regex ) from numpy.lib.index_tricks import ( - mgrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from, + mgrid, ogrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from, index_exp, ndindex, r_, s_, ix_ ) @@ -156,6 +156,15 @@ class TestGrid(object): assert_array_almost_equal(d[1, :, 1] - d[1, :, 0], 0.2*np.ones(20, 'd'), 11) + def test_sparse(self): + grid_full = mgrid[-1:1:10j, -2:2:10j] + grid_sparse = ogrid[-1:1:10j, -2:2:10j] + + # sparse grids can be made dense by broadcasting + grid_broadcast = np.broadcast_arrays(*grid_sparse) + for f, b in zip(grid_full, grid_broadcast): + assert_equal(f, b) + class TestConcatenator(object): def test_1d(self): @@ -184,37 +193,6 @@ class TestConcatenator(object): assert_array_equal(d[:5, :], b) assert_array_equal(d[5:, :], c) - def test_matrix(self): - a = [1, 2] - b = [3, 4] - - ab_r = np.r_['r', a, b] - ab_c = np.r_['c', a, b] - - assert_equal(type(ab_r), np.matrix) - assert_equal(type(ab_c), np.matrix) - - assert_equal(np.array(ab_r), [[1,2,3,4]]) - assert_equal(np.array(ab_c), [[1],[2],[3],[4]]) - - assert_raises(ValueError, lambda: np.r_['rc', a, b]) - - def test_matrix_scalar(self): - r = np.r_['r', [1, 2], 3] - assert_equal(type(r), np.matrix) - assert_equal(np.array(r), [[1,2,3]]) - - def test_matrix_builder(self): - a = np.array([1]) - b = np.array([2]) - c = np.array([3]) - d = np.array([4]) - actual = np.r_['a, b; c, d'] - expected = np.bmat([[a, b], [c, d]]) - - assert_equal(actual, expected) - assert_equal(type(actual), 
type(expected)) - def test_0d(self): assert_equal(r_[0, np.array(1), 2], [0, 1, 2]) assert_equal(r_[[0, 1, 2], np.array(3)], [0, 1, 2, 3]) diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index e69d9dd7d..504372faf 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -113,42 +113,46 @@ class TestNanFunctions_MinMax(object): for f in self.nanfuncs: assert_(f(0.) == 0.) - def test_matrices(self): + def test_subclass(self): + class MyNDArray(np.ndarray): + pass + # Check that it works and that type and # shape are preserved - mat = np.matrix(np.eye(3)) + mine = np.eye(3).view(MyNDArray) for f in self.nanfuncs: - res = f(mat, axis=0) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (1, 3)) - res = f(mat, axis=1) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (3, 1)) - res = f(mat) - assert_(np.isscalar(res)) + res = f(mine, axis=0) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == (3,)) + res = f(mine, axis=1) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == (3,)) + res = f(mine) + assert_(res.shape == ()) + # check that rows of nan are dealt with for subclasses (#4628) - mat[1] = np.nan + mine[1] = np.nan for f in self.nanfuncs: with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') - res = f(mat, axis=0) - assert_(isinstance(res, np.matrix)) + res = f(mine, axis=0) + assert_(isinstance(res, MyNDArray)) assert_(not np.any(np.isnan(res))) assert_(len(w) == 0) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') - res = f(mat, axis=1) - assert_(isinstance(res, np.matrix)) - assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0]) - and not np.isnan(res[2, 0])) + res = f(mine, axis=1) + assert_(isinstance(res, MyNDArray)) + assert_(np.isnan(res[1]) and not np.isnan(res[0]) + and not np.isnan(res[2])) assert_(len(w) == 1, 'no warning raised') assert_(issubclass(w[0].category, RuntimeWarning)) 
with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') - res = f(mat) - assert_(np.isscalar(res)) + res = f(mine) + assert_(res.shape == ()) assert_(res != np.nan) assert_(len(w) == 0) @@ -209,19 +213,22 @@ class TestNanFunctions_ArgminArgmax(object): for f in self.nanfuncs: assert_(f(0.) == 0.) - def test_matrices(self): + def test_subclass(self): + class MyNDArray(np.ndarray): + pass + # Check that it works and that type and # shape are preserved - mat = np.matrix(np.eye(3)) + mine = np.eye(3).view(MyNDArray) for f in self.nanfuncs: - res = f(mat, axis=0) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (1, 3)) - res = f(mat, axis=1) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (3, 1)) - res = f(mat) - assert_(np.isscalar(res)) + res = f(mine, axis=0) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == (3,)) + res = f(mine, axis=1) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == (3,)) + res = f(mine) + assert_(res.shape == ()) class TestNanFunctions_IntTypes(object): @@ -381,19 +388,27 @@ class SharedNanFunctionsTestsMixin(object): for f in self.nanfuncs: assert_(f(0.) == 0.) 
- def test_matrices(self): + def test_subclass(self): + class MyNDArray(np.ndarray): + pass + # Check that it works and that type and # shape are preserved - mat = np.matrix(np.eye(3)) + array = np.eye(3) + mine = array.view(MyNDArray) for f in self.nanfuncs: - res = f(mat, axis=0) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (1, 3)) - res = f(mat, axis=1) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (3, 1)) - res = f(mat) - assert_(np.isscalar(res)) + expected_shape = f(array, axis=0).shape + res = f(mine, axis=0) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == expected_shape) + expected_shape = f(array, axis=1).shape + res = f(mine, axis=1) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == expected_shape) + expected_shape = f(array).shape + res = f(mine) + assert_(isinstance(res, MyNDArray)) + assert_(res.shape == expected_shape) class TestNanFunctions_SumProd(SharedNanFunctionsTestsMixin): @@ -481,18 +496,6 @@ class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin): res = f(d, axis=axis) assert_equal(res.shape, (3, 5, 7, 11)) - def test_matrices(self): - # Check that it works and that type and - # shape are preserved - mat = np.matrix(np.eye(3)) - for f in self.nanfuncs: - for axis in np.arange(2): - res = f(mat, axis=axis) - assert_(isinstance(res, np.matrix)) - assert_(res.shape == (3, 3)) - res = f(mat) - assert_(res.shape == (1, 3*3)) - def test_result_values(self): for axis in (-2, -1, 0, 1, None): tgt = np.cumprod(_ndat_ones, axis=axis) @@ -912,7 +915,7 @@ class TestNanFunctions_Quantile(object): assert_equal(np.nanquantile(x, 0.5), 1.75) def test_no_p_overwrite(self): - # this is worth retesting, beause quantile does not make a copy + # this is worth retesting, because quantile does not make a copy p0 = np.array([0, 0.75, 0.25, 0.5, 1.0]) p = p0.copy() np.nanquantile(np.arange(100.), p, interpolation="midpoint") diff --git a/numpy/lib/tests/test_shape_base.py 
b/numpy/lib/tests/test_shape_base.py index 080fd066d..c95894f94 100644 --- a/numpy/lib/tests/test_shape_base.py +++ b/numpy/lib/tests/test_shape_base.py @@ -2,16 +2,106 @@ from __future__ import division, absolute_import, print_function import numpy as np import warnings +import functools from numpy.lib.shape_base import ( apply_along_axis, apply_over_axes, array_split, split, hsplit, dsplit, - vsplit, dstack, column_stack, kron, tile, expand_dims, + vsplit, dstack, column_stack, kron, tile, expand_dims, take_along_axis, + put_along_axis ) from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_raises, assert_warns ) +def _add_keepdims(func): + """ hack in keepdims behavior into a function taking an axis """ + @functools.wraps(func) + def wrapped(a, axis, **kwargs): + res = func(a, axis=axis, **kwargs) + if axis is None: + axis = 0 # res is now a scalar, so we can insert this anywhere + return np.expand_dims(res, axis=axis) + return wrapped + + +class TestTakeAlongAxis(object): + def test_argequivalent(self): + """ Test it translates from arg<func> to <func> """ + from numpy.random import rand + a = rand(3, 4, 5) + + funcs = [ + (np.sort, np.argsort, dict()), + (_add_keepdims(np.min), _add_keepdims(np.argmin), dict()), + (_add_keepdims(np.max), _add_keepdims(np.argmax), dict()), + (np.partition, np.argpartition, dict(kth=2)), + ] + + for func, argfunc, kwargs in funcs: + for axis in list(range(a.ndim)) + [None]: + a_func = func(a, axis=axis, **kwargs) + ai_func = argfunc(a, axis=axis, **kwargs) + assert_equal(a_func, take_along_axis(a, ai_func, axis=axis)) + + def test_invalid(self): + """ Test it errors when indices has too few dimensions """ + a = np.ones((10, 10)) + ai = np.ones((10, 2), dtype=np.intp) + + # sanity check + take_along_axis(a, ai, axis=1) + + # not enough indices + assert_raises(ValueError, take_along_axis, a, np.array(1), axis=1) + # bool arrays not allowed + assert_raises(IndexError, take_along_axis, a, ai.astype(bool), 
axis=1) + # float arrays not allowed + assert_raises(IndexError, take_along_axis, a, ai.astype(float), axis=1) + # invalid axis + assert_raises(np.AxisError, take_along_axis, a, ai, axis=10) + + def test_empty(self): + """ Test everything is ok with empty results, even with inserted dims """ + a = np.ones((3, 4, 5)) + ai = np.ones((3, 0, 5), dtype=np.intp) + + actual = take_along_axis(a, ai, axis=1) + assert_equal(actual.shape, ai.shape) + + def test_broadcast(self): + """ Test that non-indexing dimensions are broadcast in both directions """ + a = np.ones((3, 4, 1)) + ai = np.ones((1, 2, 5), dtype=np.intp) + actual = take_along_axis(a, ai, axis=1) + assert_equal(actual.shape, (3, 2, 5)) + + +class TestPutAlongAxis(object): + def test_replace_max(self): + a_base = np.array([[10, 30, 20], [60, 40, 50]]) + + for axis in list(range(a_base.ndim)) + [None]: + # we mutate this in the loop + a = a_base.copy() + + # replace the max with a small value + i_max = _add_keepdims(np.argmax)(a, axis=axis) + put_along_axis(a, i_max, -99, axis=axis) + + # find the new minimum, which should max + i_min = _add_keepdims(np.argmin)(a, axis=axis) + + assert_equal(i_min, i_max) + + def test_broadcast(self): + """ Test that non-indexing dimensions are broadcast in both directions """ + a = np.ones((3, 4, 1)) + ai = np.arange(10, dtype=np.intp).reshape((1, 2, 5)) % 4 + put_along_axis(a, ai, 20, axis=1) + assert_equal(take_along_axis(a, ai, axis=1), 20) + + class TestApplyAlongAxis(object): def test_simple(self): a = np.ones((20, 10), 'd') @@ -29,19 +119,21 @@ class TestApplyAlongAxis(object): [[27, 30, 33], [36, 39, 42], [45, 48, 51]]) def test_preserve_subclass(self): - # this test is particularly malicious because matrix - # refuses to become 1d def double(row): return row * 2 - m = np.matrix([[0, 1], [2, 3]]) - expected = np.matrix([[0, 2], [4, 6]]) + + class MyNDArray(np.ndarray): + pass + + m = np.array([[0, 1], [2, 3]]).view(MyNDArray) + expected = np.array([[0, 2], [4, 
6]]).view(MyNDArray) result = apply_along_axis(double, 0, m) - assert_(isinstance(result, np.matrix)) + assert_(isinstance(result, MyNDArray)) assert_array_equal(result, expected) result = apply_along_axis(double, 1, m) - assert_(isinstance(result, np.matrix)) + assert_(isinstance(result, MyNDArray)) assert_array_equal(result, expected) def test_subclass(self): @@ -79,7 +171,7 @@ class TestApplyAlongAxis(object): def test_axis_insertion(self, cls=np.ndarray): def f1to2(x): - """produces an assymmetric non-square matrix from x""" + """produces an asymmetric non-square matrix from x""" assert_equal(x.ndim, 1) return (x[::-1] * x[1:,None]).view(cls) @@ -123,7 +215,7 @@ class TestApplyAlongAxis(object): def test_axis_insertion_ma(self): def f1to2(x): - """produces an assymmetric non-square matrix from x""" + """produces an asymmetric non-square matrix from x""" assert_equal(x.ndim, 1) res = x[::-1] * x[1:,None] return np.ma.masked_where(res%5==0, res) @@ -492,16 +584,10 @@ class TestSqueeze(object): class TestKron(object): def test_return_type(self): - a = np.ones([2, 2]) - m = np.asmatrix(a) - assert_equal(type(kron(a, a)), np.ndarray) - assert_equal(type(kron(m, m)), np.matrix) - assert_equal(type(kron(a, m)), np.matrix) - assert_equal(type(kron(m, a)), np.matrix) - class myarray(np.ndarray): __array_priority__ = 0.0 + a = np.ones([2, 2]) ma = myarray(a.shape, a.dtype, a.data) assert_equal(type(kron(a, a)), np.ndarray) assert_equal(type(kron(ma, ma)), myarray) diff --git a/numpy/lib/tests/test_ufunclike.py b/numpy/lib/tests/test_ufunclike.py index ad006fe17..5604b3744 100644 --- a/numpy/lib/tests/test_ufunclike.py +++ b/numpy/lib/tests/test_ufunclike.py @@ -55,6 +55,10 @@ class TestUfunclike(object): obj.metadata = self.metadata return obj + def __array_finalize__(self, obj): + self.metadata = getattr(obj, 'metadata', None) + return self + a = nx.array([1.1, -1.1]) m = MyArray(a, metadata='foo') f = ufl.fix(m) diff --git a/numpy/lib/twodim_base.py 
b/numpy/lib/twodim_base.py index 402c18850..cca316e9a 100644 --- a/numpy/lib/twodim_base.py +++ b/numpy/lib/twodim_base.py @@ -650,7 +650,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None): N = 1 if N != 1 and N != 2: - xedges = yedges = asarray(bins, float) + xedges = yedges = asarray(bins) bins = [xedges, yedges] hist, edges = histogramdd([x, y], bins, range, normed, weights) return hist, edges[0], edges[1] diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py index 5ee230f92..98af0733b 100644 --- a/numpy/linalg/linalg.py +++ b/numpy/linalg/linalg.py @@ -16,20 +16,20 @@ __all__ = ['matrix_power', 'solve', 'tensorsolve', 'tensorinv', 'inv', 'svd', 'eig', 'eigh', 'lstsq', 'norm', 'qr', 'cond', 'matrix_rank', 'LinAlgError', 'multi_dot'] +import operator import warnings from numpy.core import ( array, asarray, zeros, empty, empty_like, intc, single, double, - csingle, cdouble, inexact, complexfloating, newaxis, ravel, all, Inf, dot, - add, multiply, sqrt, maximum, fastCopyAndTranspose, sum, isfinite, size, - finfo, errstate, geterrobj, longdouble, moveaxis, amin, amax, product, abs, - broadcast, atleast_2d, intp, asanyarray, object_, ones, matmul, - swapaxes, divide, count_nonzero, ndarray, isnan + csingle, cdouble, inexact, complexfloating, newaxis, all, Inf, dot, + add, multiply, sqrt, fastCopyAndTranspose, sum, isfinite, + finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs, + atleast_2d, intp, asanyarray, object_, matmul, + swapaxes, divide, count_nonzero, isnan ) from numpy.core.multiarray import normalize_axis_index -from numpy.lib import triu, asfarray +from numpy.lib.twodim_base import triu, eye from numpy.linalg import lapack_lite, _umath_linalg -from numpy.matrixlib.defmatrix import matrix_power # For Python2/3 compatibility _N = b'N' @@ -210,7 +210,8 @@ def _assertSquareness(*arrays): def _assertNdSquareness(*arrays): for a in arrays: - if max(a.shape[-2:]) != min(a.shape[-2:]): + m, n = a.shape[-2:] + if m != n: 
raise LinAlgError('Last 2 dimensions of the array must be square') def _assertFinite(*arrays): @@ -532,6 +533,109 @@ def inv(a): return wrap(ainv.astype(result_t, copy=False)) +def matrix_power(a, n): + """ + Raise a square matrix to the (integer) power `n`. + + For positive integers `n`, the power is computed by repeated matrix + squarings and matrix multiplications. If ``n == 0``, the identity matrix + of the same shape as M is returned. If ``n < 0``, the inverse + is computed and then raised to the ``abs(n)``. + + Parameters + ---------- + a : (..., M, M) array_like + Matrix to be "powered." + n : int + The exponent can be any integer or long integer, positive, + negative, or zero. + + Returns + ------- + a**n : (..., M, M) ndarray or matrix object + The return value is the same shape and type as `M`; + if the exponent is positive or zero then the type of the + elements is the same as those of `M`. If the exponent is + negative the elements are floating-point. + + Raises + ------ + LinAlgError + For matrices that are not square or that (for negative powers) cannot + be inverted numerically. + + Examples + -------- + >>> from numpy.linalg import matrix_power + >>> i = np.array([[0, 1], [-1, 0]]) # matrix equiv. of the imaginary unit + >>> matrix_power(i, 3) # should = -i + array([[ 0, -1], + [ 1, 0]]) + >>> matrix_power(i, 0) + array([[1, 0], + [0, 1]]) + >>> matrix_power(i, -3) # should = 1/(-i) = i, but w/ f.p. 
elements + array([[ 0., 1.], + [-1., 0.]]) + + Somewhat more sophisticated example + + >>> q = np.zeros((4, 4)) + >>> q[0:2, 0:2] = -i + >>> q[2:4, 2:4] = i + >>> q # one of the three quaternion units not equal to 1 + array([[ 0., -1., 0., 0.], + [ 1., 0., 0., 0.], + [ 0., 0., 0., 1.], + [ 0., 0., -1., 0.]]) + >>> matrix_power(q, 2) # = -np.eye(4) + array([[-1., 0., 0., 0.], + [ 0., -1., 0., 0.], + [ 0., 0., -1., 0.], + [ 0., 0., 0., -1.]]) + + """ + a = asanyarray(a) + _assertRankAtLeast2(a) + _assertNdSquareness(a) + + try: + n = operator.index(n) + except TypeError: + raise TypeError("exponent must be an integer") + + if n == 0: + a = empty_like(a) + a[...] = eye(a.shape[-2], dtype=a.dtype) + return a + + elif n < 0: + a = inv(a) + n = abs(n) + + # short-cuts. + if n == 1: + return a + + elif n == 2: + return matmul(a, a) + + elif n == 3: + return matmul(matmul(a, a), a) + + # Use binary decomposition to reduce the number of matrix multiplications. + # Here, we iterate over the bits of n, from LSB to MSB, raise `a` to + # increasing powers of 2, and multiply into the result as needed. 
+ z = result = None + while n > 0: + z = a if z is None else matmul(z, z) + n, bit = divmod(n, 2) + if bit: + result = z if result is None else matmul(result, z) + + return result + + # Cholesky decomposition def cholesky(a): @@ -1429,8 +1533,7 @@ def svd(a, full_matrices=True, compute_uv=True): extobj = get_linalg_error_extobj(_raise_linalgerror_svd_nonconvergence) - m = a.shape[-2] - n = a.shape[-1] + m, n = a.shape[-2:] if compute_uv: if full_matrices: if m < n: @@ -1750,7 +1853,8 @@ def pinv(a, rcond=1e-15 ): a, wrap = _makearray(a) rcond = asarray(rcond) if _isEmpty2d(a): - res = empty(a.shape[:-2] + (a.shape[-1], a.shape[-2]), dtype=a.dtype) + m, n = a.shape[-2:] + res = empty(a.shape[:-2] + (n, m), dtype=a.dtype) return wrap(res) a = a.conjugate() u, s, vt = svd(a, full_matrices=False) @@ -2007,10 +2111,9 @@ def lstsq(a, b, rcond="warn"): b = b[:, newaxis] _assertRank2(a, b) _assertNoEmpty2d(a, b) # TODO: relax this constraint - m = a.shape[0] - n = a.shape[1] - n_rhs = b.shape[1] - if m != b.shape[0]: + m, n = a.shape[-2:] + m2, n_rhs = b.shape[-2:] + if m != m2: raise LinAlgError('Incompatible dimensions') t, result_t = _commonType(a, b) diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py index 4a87330c7..87dfe988a 100644 --- a/numpy/linalg/tests/test_linalg.py +++ b/numpy/linalg/tests/test_linalg.py @@ -7,11 +7,12 @@ import os import sys import itertools import traceback -import warnings +import textwrap +import subprocess import pytest import numpy as np -from numpy import array, single, double, csingle, cdouble, dot, identity +from numpy import array, single, double, csingle, cdouble, dot, identity, matmul from numpy import multiply, atleast_2d, inf, asarray, matrix from numpy import linalg from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError @@ -22,12 +23,11 @@ from numpy.testing import ( ) -def ifthen(a, b): - return not a or b - - -def imply(a, b): - return not a or b +def 
consistent_subclass(out, in_): + # For ndarray subclass input, our output should have the same subclass + # (non-ndarray input gets converted to ndarray). + return type(out) is (type(in_) if isinstance(in_, np.ndarray) + else np.ndarray) old_assert_almost_equal = assert_almost_equal @@ -65,6 +65,7 @@ all_tags = { 'generalized', 'size-0', 'strided' # optional additions } + class LinalgCase(object): def __init__(self, name, a, b, tags=set()): """ @@ -86,6 +87,7 @@ class LinalgCase(object): def __repr__(self): return "<LinalgCase: %s>" % (self.name,) + def apply_tag(tag, cases): """ Add the given tag (a string) to each of the cases (a list of LinalgCase @@ -129,10 +131,6 @@ CASES += apply_tag('square', [ np.empty((0, 0), dtype=double), np.empty((0,), dtype=double), tags={'size-0'}), - LinalgCase("0x0_matrix", - np.empty((0, 0), dtype=double).view(np.matrix), - np.empty((0, 1), dtype=double).view(np.matrix), - tags={'size-0'}), LinalgCase("8x8", np.random.rand(8, 8), np.random.rand(8)), @@ -142,12 +140,6 @@ CASES += apply_tag('square', [ LinalgCase("nonarray", [[1, 2], [3, 4]], [2, 1]), - LinalgCase("matrix_b_only", - array([[1., 2.], [3., 4.]]), - matrix([2., 1.]).T), - LinalgCase("matrix_a_and_b", - matrix([[1., 2.], [3., 4.]]), - matrix([2., 1.]).T), ]) # non-square test-cases @@ -231,9 +223,6 @@ CASES += apply_tag('hermitian', [ LinalgCase("matrix_b_only", array([[1., 2.], [2., 1.]]), None), - LinalgCase("hmatrix_a_and_b", - matrix([[1., 2.], [2., 1.]]), - None), LinalgCase("hmatrix_1x1", np.random.rand(1, 1), None), @@ -270,12 +259,13 @@ def _make_generalized_cases(): return new_cases + CASES += _make_generalized_cases() + # # Generate stride combination variations of the above # - def _stride_comb_iter(x): """ Generate cartesian product of strides for all axes @@ -323,6 +313,7 @@ def _stride_comb_iter(x): xi = np.lib.stride_tricks.as_strided(x, strides=s) yield xi, "stride_xxx_0_0" + def _make_strided_cases(): new_cases = [] for case in CASES: @@ -333,94 +324,104 
@@ def _make_strided_cases(): new_cases.append(new_case) return new_cases + CASES += _make_strided_cases() # # Test different routines against the above cases # +class LinalgTestCase(object): + TEST_CASES = CASES -def _check_cases(func, require=set(), exclude=set()): - """ - Run func on each of the cases with all of the tags in require, and none - of the tags in exclude - """ - for case in CASES: - # filter by require and exclude - if case.tags & require != require: - continue - if case.tags & exclude: - continue + def check_cases(self, require=set(), exclude=set()): + """ + Run func on each of the cases with all of the tags in require, and none + of the tags in exclude + """ + for case in self.TEST_CASES: + # filter by require and exclude + if case.tags & require != require: + continue + if case.tags & exclude: + continue - try: - case.check(func) - except Exception: - msg = "In test case: %r\n\n" % case - msg += traceback.format_exc() - raise AssertionError(msg) + try: + case.check(self.do) + except Exception: + msg = "In test case: %r\n\n" % case + msg += traceback.format_exc() + raise AssertionError(msg) -class LinalgSquareTestCase(object): +class LinalgSquareTestCase(LinalgTestCase): def test_sq_cases(self): - _check_cases(self.do, require={'square'}, exclude={'generalized', 'size-0'}) + self.check_cases(require={'square'}, + exclude={'generalized', 'size-0'}) def test_empty_sq_cases(self): - _check_cases(self.do, require={'square', 'size-0'}, exclude={'generalized'}) + self.check_cases(require={'square', 'size-0'}, + exclude={'generalized'}) -class LinalgNonsquareTestCase(object): +class LinalgNonsquareTestCase(LinalgTestCase): def test_nonsq_cases(self): - _check_cases(self.do, require={'nonsquare'}, exclude={'generalized', 'size-0'}) + self.check_cases(require={'nonsquare'}, + exclude={'generalized', 'size-0'}) def test_empty_nonsq_cases(self): - _check_cases(self.do, require={'nonsquare', 'size-0'}, exclude={'generalized'}) + 
self.check_cases(require={'nonsquare', 'size-0'}, + exclude={'generalized'}) -class HermitianTestCase(object): + +class HermitianTestCase(LinalgTestCase): def test_herm_cases(self): - _check_cases(self.do, require={'hermitian'}, exclude={'generalized', 'size-0'}) + self.check_cases(require={'hermitian'}, + exclude={'generalized', 'size-0'}) def test_empty_herm_cases(self): - _check_cases(self.do, require={'hermitian', 'size-0'}, exclude={'generalized'}) + self.check_cases(require={'hermitian', 'size-0'}, + exclude={'generalized'}) -class LinalgGeneralizedSquareTestCase(object): +class LinalgGeneralizedSquareTestCase(LinalgTestCase): @pytest.mark.slow def test_generalized_sq_cases(self): - _check_cases(self.do, require={'generalized', 'square'}, exclude={'size-0'}) + self.check_cases(require={'generalized', 'square'}, + exclude={'size-0'}) @pytest.mark.slow def test_generalized_empty_sq_cases(self): - _check_cases(self.do, require={'generalized', 'square', 'size-0'}) + self.check_cases(require={'generalized', 'square', 'size-0'}) -class LinalgGeneralizedNonsquareTestCase(object): +class LinalgGeneralizedNonsquareTestCase(LinalgTestCase): @pytest.mark.slow def test_generalized_nonsq_cases(self): - _check_cases(self.do, require={'generalized', 'nonsquare'}, exclude={'size-0'}) + self.check_cases(require={'generalized', 'nonsquare'}, + exclude={'size-0'}) @pytest.mark.slow def test_generalized_empty_nonsq_cases(self): - _check_cases(self.do, require={'generalized', 'nonsquare', 'size-0'}) + self.check_cases(require={'generalized', 'nonsquare', 'size-0'}) -class HermitianGeneralizedTestCase(object): +class HermitianGeneralizedTestCase(LinalgTestCase): @pytest.mark.slow def test_generalized_herm_cases(self): - _check_cases(self.do, - require={'generalized', 'hermitian'}, - exclude={'size-0'}) + self.check_cases(require={'generalized', 'hermitian'}, + exclude={'size-0'}) @pytest.mark.slow def test_generalized_empty_herm_cases(self): - _check_cases(self.do, - 
require={'generalized', 'hermitian', 'size-0'}, - exclude={'none'}) + self.check_cases(require={'generalized', 'hermitian', 'size-0'}, + exclude={'none'}) def dot_generalized(a, b): @@ -446,20 +447,21 @@ def identity_like_generalized(a): a = asarray(a) if a.ndim >= 3: r = np.empty(a.shape, dtype=a.dtype) - for c in itertools.product(*map(range, a.shape[:-2])): - r[c] = identity(a.shape[-2]) + r[...] = identity(a.shape[-2]) return r else: return identity(a.shape[0]) -class TestSolve(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): - +class SolveCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + # kept apart from TestSolve for use for testing with matrices. def do(self, a, b, tags): x = linalg.solve(a, b) assert_almost_equal(b, dot_generalized(a, x)) - assert_(imply(isinstance(b, matrix), isinstance(x, matrix))) + assert_(consistent_subclass(x, b)) + +class TestSolve(SolveCases): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -519,14 +521,16 @@ class TestSolve(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(isinstance(result, ArraySubclass)) -class TestInv(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class InvCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): a_inv = linalg.inv(a) assert_almost_equal(dot_generalized(a, a_inv), identity_like_generalized(a)) - assert_(imply(isinstance(a, matrix), isinstance(a_inv, matrix))) + assert_(consistent_subclass(a_inv, a)) + +class TestInv(InvCases): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -551,13 +555,15 @@ class TestInv(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(isinstance(res, ArraySubclass)) -class TestEigvals(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class EigvalsCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): ev = linalg.eigvals(a) evalues, evectors = linalg.eig(a) 
assert_almost_equal(ev, evalues) + +class TestEigvals(EigvalsCases): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -586,15 +592,17 @@ class TestEigvals(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(isinstance(res, np.ndarray)) -class TestEig(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class EigCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): evalues, evectors = linalg.eig(a) assert_allclose(dot_generalized(a, evectors), np.asarray(evectors) * np.asarray(evalues)[..., None, :], rtol=get_rtol(evalues.dtype)) - assert_(imply(isinstance(a, matrix), isinstance(evectors, matrix))) + assert_(consistent_subclass(evectors, a)) + +class TestEig(EigCases): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -633,7 +641,7 @@ class TestEig(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(isinstance(a, np.ndarray)) -class TestSVD(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): if 'size-0' in tags: @@ -644,9 +652,11 @@ class TestSVD(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :], np.asarray(vt)), rtol=get_rtol(u.dtype)) - assert_(imply(isinstance(a, matrix), isinstance(u, matrix))) - assert_(imply(isinstance(a, matrix), isinstance(vt, matrix))) + assert_(consistent_subclass(u, a)) + assert_(consistent_subclass(vt, a)) + +class TestSVD(SVDCases): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -671,7 +681,7 @@ class TestSVD(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_raises(linalg.LinAlgError, linalg.svd, a) -class TestCond(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): # cond(x, p) 
for p in (None, 2, -2) def do(self, a, b, tags): @@ -716,6 +726,8 @@ class TestCond(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): * (abs(cinv)**2).sum(-1).sum(-1)), single_decimal=5, double_decimal=11) + +class TestCond(CondCases): def test_basic_nonsvd(self): # Smoketest the non-svd norms A = array([[1., 0, 1], [0, -2., 0], [0, 0, 3.]]) @@ -779,20 +791,24 @@ class TestCond(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(np.isfinite(c[1,0])) -class TestPinv(LinalgSquareTestCase, - LinalgNonsquareTestCase, - LinalgGeneralizedSquareTestCase, - LinalgGeneralizedNonsquareTestCase): +class PinvCases(LinalgSquareTestCase, + LinalgNonsquareTestCase, + LinalgGeneralizedSquareTestCase, + LinalgGeneralizedNonsquareTestCase): def do(self, a, b, tags): a_ginv = linalg.pinv(a) # `a @ a_ginv == I` does not hold if a is singular dot = dot_generalized assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11) - assert_(imply(isinstance(a, matrix), isinstance(a_ginv, matrix))) + assert_(consistent_subclass(a_ginv, a)) + + +class TestPinv(PinvCases): + pass -class TestDet(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): +class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): def do(self, a, b, tags): d = linalg.det(a) @@ -811,6 +827,8 @@ class TestDet(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_almost_equal(np.abs(s[m]), 1) assert_equal(ld[~m], -inf) + +class TestDet(DetCases): def test_zero(self): assert_equal(linalg.det([[0.0]]), 0.0) assert_equal(type(linalg.det([[0.0]])), double) @@ -854,7 +872,7 @@ class TestDet(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): assert_(res[1].dtype.type is np.float64) -class TestLstsq(LinalgSquareTestCase, LinalgNonsquareTestCase): +class LstsqCases(LinalgSquareTestCase, LinalgNonsquareTestCase): def do(self, a, b, tags): if 'size-0' in tags: @@ -882,9 +900,11 @@ class TestLstsq(LinalgSquareTestCase, LinalgNonsquareTestCase): expect_resids = 
np.array([]).view(type(x)) assert_almost_equal(residuals, expect_resids) assert_(np.issubdtype(residuals.dtype, np.floating)) - assert_(imply(isinstance(b, matrix), isinstance(x, matrix))) - assert_(imply(isinstance(b, matrix), isinstance(residuals, matrix))) + assert_(consistent_subclass(x, b)) + assert_(consistent_subclass(residuals, b)) + +class TestLstsq(LstsqCases): def test_future_rcond(self): a = np.array([[0., 1., 0., 1., 2., 0.], [0., 2., 0., 0., 1., 0.], @@ -903,20 +923,26 @@ class TestLstsq(LinalgSquareTestCase, LinalgNonsquareTestCase): # Warning should be raised exactly once (first command) assert_(len(w) == 1) + class TestMatrixPower(object): R90 = array([[0, 1], [-1, 0]]) Arb22 = array([[4, -7], [-2, 10]]) noninv = array([[1, 0], [0, 0]]) - arbfloat = array([[0.1, 3.2], [1.2, 0.7]]) + arbfloat = array([[[0.1, 3.2], [1.2, 0.7]], + [[0.2, 6.4], [2.4, 1.4]]]) large = identity(10) t = large[1, :].copy() - large[1, :] = large[0,:] + large[1, :] = large[0, :] large[0, :] = t def test_large_power(self): assert_equal( matrix_power(self.R90, 2 ** 100 + 2 ** 10 + 2 ** 5 + 1), self.R90) + assert_equal( + matrix_power(self.R90, 2 ** 100 + 2 ** 10 + 1), self.R90) + assert_equal( + matrix_power(self.R90, 2 ** 100 + 2 + 1), -self.R90) def test_large_power_trailing_zero(self): assert_equal( @@ -925,7 +951,7 @@ class TestMatrixPower(object): def testip_zero(self): def tz(M): mz = matrix_power(M, 0) - assert_equal(mz, identity(M.shape[0])) + assert_equal(mz, identity_like_generalized(M)) assert_equal(mz.dtype, M.dtype) for M in [self.Arb22, self.arbfloat, self.large]: tz(M) @@ -941,7 +967,7 @@ class TestMatrixPower(object): def testip_two(self): def tz(M): mz = matrix_power(M, 2) - assert_equal(mz, dot(M, M)) + assert_equal(mz, matmul(M, M)) assert_equal(mz.dtype, M.dtype) for M in [self.Arb22, self.arbfloat, self.large]: tz(M) @@ -949,14 +975,19 @@ class TestMatrixPower(object): def testip_invert(self): def tz(M): mz = matrix_power(M, -1) - 
assert_almost_equal(identity(M.shape[0]), dot(mz, M)) + assert_almost_equal(matmul(mz, M), identity_like_generalized(M)) for M in [self.R90, self.Arb22, self.arbfloat, self.large]: tz(M) def test_invert_noninvertible(self): - import numpy.linalg - assert_raises(numpy.linalg.linalg.LinAlgError, - lambda: matrix_power(self.noninv, -1)) + assert_raises(LinAlgError, matrix_power, self.noninv, -1) + + def test_invalid(self): + assert_raises(TypeError, matrix_power, self.R90, 1.5) + assert_raises(TypeError, matrix_power, self.R90, [1]) + assert_raises(LinAlgError, matrix_power, np.array([1]), 1) + assert_raises(LinAlgError, matrix_power, np.array([[1], [2]]), 1) + assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2)), 1) class TestBoolPower(object): @@ -966,7 +997,7 @@ class TestBoolPower(object): assert_equal(matrix_power(A, 2), A) -class TestEigvalsh(HermitianTestCase, HermitianGeneralizedTestCase): +class TestEigvalshCases(HermitianTestCase, HermitianGeneralizedTestCase): def do(self, a, b, tags): # note that eigenvalue arrays returned by eig must be sorted since @@ -979,6 +1010,8 @@ class TestEigvalsh(HermitianTestCase, HermitianGeneralizedTestCase): ev2 = linalg.eigvalsh(a, 'U') assert_allclose(ev2, evalues, rtol=get_rtol(ev.dtype)) + +class TestEigvalsh(object): def test_types(self): def check(dtype): x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) @@ -1034,7 +1067,7 @@ class TestEigvalsh(HermitianTestCase, HermitianGeneralizedTestCase): assert_(isinstance(res, np.ndarray)) -class TestEigh(HermitianTestCase, HermitianGeneralizedTestCase): +class TestEighCases(HermitianTestCase, HermitianGeneralizedTestCase): def do(self, a, b, tags): # note that eigenvalue arrays returned by eig must be sorted since @@ -1055,6 +1088,8 @@ class TestEigh(HermitianTestCase, HermitianGeneralizedTestCase): np.asarray(ev2)[..., None, :] * np.asarray(evc2), rtol=get_rtol(ev.dtype), err_msg=repr(a)) + +class TestEigh(object): def test_types(self): def check(dtype): x = np.array([[1, 
0.5], [0.5, 1]], dtype=dtype) @@ -1115,11 +1150,13 @@ class TestEigh(HermitianTestCase, HermitianGeneralizedTestCase): assert_(isinstance(a, np.ndarray)) -class _TestNorm(object): - +class _TestNormBase(object): dt = None dec = None + +class _TestNormGeneral(_TestNormBase): + def test_empty(self): assert_equal(norm([]), 0.0) assert_equal(norm(array([], dtype=self.dt)), 0.0) @@ -1166,57 +1203,6 @@ class _TestNorm(object): assert_(issubclass(an.dtype.type, np.floating)) assert_almost_equal(an, 1.0) - def test_matrix_return_type(self): - a = np.array([[1, 0, 1], [0, 1, 1]]) - - exact_types = np.typecodes['AllInteger'] - - # float32, complex64, float64, complex128 types are the only types - # allowed by `linalg`, which performs the matrix operations used - # within `norm`. - inexact_types = 'fdFD' - - all_types = exact_types + inexact_types - - for each_inexact_types in all_types: - at = a.astype(each_inexact_types) - - an = norm(at, -np.inf) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 2.0) - - with suppress_warnings() as sup: - sup.filter(RuntimeWarning, "divide by zero encountered") - an = norm(at, -1) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 1.0) - - an = norm(at, 1) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 2.0) - - an = norm(at, 2) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 3.0**(1.0/2.0)) - - an = norm(at, -2) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 1.0) - - an = norm(at, np.inf) - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 2.0) - - an = norm(at, 'fro') - assert_(issubclass(an.dtype.type, np.floating)) - assert_almost_equal(an, 2.0) - - an = norm(at, 'nuc') - assert_(issubclass(an.dtype.type, np.floating)) - # Lower bar needed to support low precision floats. - # They end up being off by 1 in the 7th place. 
- old_assert_almost_equal(an, 2.7320508075688772, decimal=6) - def test_vector(self): a = [1, 2, 3, 4] b = [-1, -2, -3, -4] @@ -1247,39 +1233,6 @@ class _TestNorm(object): array(c, dtype=self.dt)): _test(v) - def test_matrix_2x2(self): - A = matrix([[1, 3], [5, 7]], dtype=self.dt) - assert_almost_equal(norm(A), 84 ** 0.5) - assert_almost_equal(norm(A, 'fro'), 84 ** 0.5) - assert_almost_equal(norm(A, 'nuc'), 10.0) - assert_almost_equal(norm(A, inf), 12.0) - assert_almost_equal(norm(A, -inf), 4.0) - assert_almost_equal(norm(A, 1), 10.0) - assert_almost_equal(norm(A, -1), 6.0) - assert_almost_equal(norm(A, 2), 9.1231056256176615) - assert_almost_equal(norm(A, -2), 0.87689437438234041) - - assert_raises(ValueError, norm, A, 'nofro') - assert_raises(ValueError, norm, A, -3) - assert_raises(ValueError, norm, A, 0) - - def test_matrix_3x3(self): - # This test has been added because the 2x2 example - # happened to have equal nuclear norm and induced 1-norm. - # The 1/10 scaling factor accommodates the absolute tolerance - # used in assert_almost_equal. - A = (1 / 10) * \ - np.array([[1, 2, 3], [6, 0, 5], [3, 2, 1]], dtype=self.dt) - assert_almost_equal(norm(A), (1 / 10) * 89 ** 0.5) - assert_almost_equal(norm(A, 'fro'), (1 / 10) * 89 ** 0.5) - assert_almost_equal(norm(A, 'nuc'), 1.3366836911774836) - assert_almost_equal(norm(A, inf), 1.1) - assert_almost_equal(norm(A, -inf), 0.6) - assert_almost_equal(norm(A, 1), 1.0) - assert_almost_equal(norm(A, -1), 0.4) - assert_almost_equal(norm(A, 2), 0.88722940323461277) - assert_almost_equal(norm(A, -2), 0.19456584790481812) - def test_axis(self): # Vector norms. 
# Compare the use of `axis` with computing the norm of each row @@ -1359,10 +1312,103 @@ class _TestNorm(object): assert_(found.shape == expected_shape, shape_err.format(found.shape, expected_shape, order, k)) + +class _TestNorm2D(_TestNormBase): + # Define the part for 2d arrays separately, so we can subclass this + # and run the tests using np.matrix in matrixlib.tests.test_matrix_linalg. + array = np.array + + def test_matrix_empty(self): + assert_equal(norm(self.array([[]], dtype=self.dt)), 0.0) + + def test_matrix_return_type(self): + a = self.array([[1, 0, 1], [0, 1, 1]]) + + exact_types = np.typecodes['AllInteger'] + + # float32, complex64, float64, complex128 types are the only types + # allowed by `linalg`, which performs the matrix operations used + # within `norm`. + inexact_types = 'fdFD' + + all_types = exact_types + inexact_types + + for each_inexact_types in all_types: + at = a.astype(each_inexact_types) + + an = norm(at, -np.inf) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 2.0) + + with suppress_warnings() as sup: + sup.filter(RuntimeWarning, "divide by zero encountered") + an = norm(at, -1) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 1.0) + + an = norm(at, 1) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 2.0) + + an = norm(at, 2) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 3.0**(1.0/2.0)) + + an = norm(at, -2) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 1.0) + + an = norm(at, np.inf) + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 2.0) + + an = norm(at, 'fro') + assert_(issubclass(an.dtype.type, np.floating)) + assert_almost_equal(an, 2.0) + + an = norm(at, 'nuc') + assert_(issubclass(an.dtype.type, np.floating)) + # Lower bar needed to support low precision floats. + # They end up being off by 1 in the 7th place. 
+ np.testing.assert_almost_equal(an, 2.7320508075688772, decimal=6) + + def test_matrix_2x2(self): + A = self.array([[1, 3], [5, 7]], dtype=self.dt) + assert_almost_equal(norm(A), 84 ** 0.5) + assert_almost_equal(norm(A, 'fro'), 84 ** 0.5) + assert_almost_equal(norm(A, 'nuc'), 10.0) + assert_almost_equal(norm(A, inf), 12.0) + assert_almost_equal(norm(A, -inf), 4.0) + assert_almost_equal(norm(A, 1), 10.0) + assert_almost_equal(norm(A, -1), 6.0) + assert_almost_equal(norm(A, 2), 9.1231056256176615) + assert_almost_equal(norm(A, -2), 0.87689437438234041) + + assert_raises(ValueError, norm, A, 'nofro') + assert_raises(ValueError, norm, A, -3) + assert_raises(ValueError, norm, A, 0) + + def test_matrix_3x3(self): + # This test has been added because the 2x2 example + # happened to have equal nuclear norm and induced 1-norm. + # The 1/10 scaling factor accommodates the absolute tolerance + # used in assert_almost_equal. + A = (1 / 10) * \ + self.array([[1, 2, 3], [6, 0, 5], [3, 2, 1]], dtype=self.dt) + assert_almost_equal(norm(A), (1 / 10) * 89 ** 0.5) + assert_almost_equal(norm(A, 'fro'), (1 / 10) * 89 ** 0.5) + assert_almost_equal(norm(A, 'nuc'), 1.3366836911774836) + assert_almost_equal(norm(A, inf), 1.1) + assert_almost_equal(norm(A, -inf), 0.6) + assert_almost_equal(norm(A, 1), 1.0) + assert_almost_equal(norm(A, -1), 0.4) + assert_almost_equal(norm(A, 2), 0.88722940323461277) + assert_almost_equal(norm(A, -2), 0.19456584790481812) + def test_bad_args(self): # Check that bad arguments raise the appropriate exceptions. 
- A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt) + A = self.array([[1, 2, 3], [4, 5, 6]], dtype=self.dt) B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4) # Using `axis=<integer>` or passing in a 1-D array implies vector @@ -1386,6 +1432,10 @@ class _TestNorm(object): assert_raises(ValueError, norm, B, None, (0, 1, 2)) +class _TestNorm(_TestNorm2D, _TestNormGeneral): + pass + + class TestNorm_NonSystematic(object): def test_longdouble_norm(self): @@ -1413,21 +1463,34 @@ class TestNorm_NonSystematic(object): old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=5) -class TestNormDouble(_TestNorm): +# Separate definitions so we can use them for matrix tests. +class _TestNormDoubleBase(_TestNormBase): dt = np.double dec = 12 -class TestNormSingle(_TestNorm): +class _TestNormSingleBase(_TestNormBase): dt = np.float32 dec = 6 -class TestNormInt64(_TestNorm): +class _TestNormInt64Base(_TestNormBase): dt = np.int64 dec = 12 +class TestNormDouble(_TestNorm, _TestNormDoubleBase): + pass + + +class TestNormSingle(_TestNorm, _TestNormSingleBase): + pass + + +class TestNormInt64(_TestNorm, _TestNormInt64Base): + pass + + class TestMatrixRank(object): def test_matrix_rank(self): @@ -1478,6 +1541,8 @@ def test_reduced_rank(): class TestQR(object): + # Define the array class here, so run this on matrices elsewhere. + array = np.array def check_qr(self, a): # This test expects the argument `a` to be an ndarray or @@ -1528,7 +1593,7 @@ class TestQR(object): # of the functions in lapack_lite. Consequently, this test is # very limited in scope. Note that the results are in FORTRAN # order, hence the h arrays are transposed. 
- a = array([[1, 2], [3, 4], [5, 6]], dtype=np.double) + a = self.array([[1, 2], [3, 4], [5, 6]], dtype=np.double) # Test double h, tau = linalg.qr(a, mode='raw') @@ -1544,22 +1609,21 @@ class TestQR(object): assert_(tau.shape == (2,)) def test_mode_all_but_economic(self): - a = array([[1, 2], [3, 4]]) - b = array([[1, 2], [3, 4], [5, 6]]) + a = self.array([[1, 2], [3, 4]]) + b = self.array([[1, 2], [3, 4], [5, 6]]) for dt in "fd": m1 = a.astype(dt) m2 = b.astype(dt) self.check_qr(m1) self.check_qr(m2) self.check_qr(m2.T) - self.check_qr(matrix(m1)) + for dt in "fd": m1 = 1 + 1j * a.astype(dt) m2 = 1 + 1j * b.astype(dt) self.check_qr(m1) self.check_qr(m2) self.check_qr(m2.T) - self.check_qr(matrix(m1)) def test_0_size(self): # There may be good ways to do (some of this) reasonably: @@ -1699,6 +1763,40 @@ def test_xerbla_override(): raise SkipTest('Numpy xerbla not linked in.') +def test_sdot_bug_8577(): + # Regression test that loading certain other libraries does not + # result to wrong results in float32 linear algebra. + # + # There's a bug gh-8577 on OSX that can trigger this, and perhaps + # there are also other situations in which it occurs. + # + # Do the check in a separate process. 
+ + bad_libs = ['PyQt5.QtWidgets', 'IPython'] + + template = textwrap.dedent(""" + import sys + {before} + try: + import {bad_lib} + except ImportError: + sys.exit(0) + {after} + x = np.ones(2, dtype=np.float32) + sys.exit(0 if np.allclose(x.dot(x), 2.0) else 1) + """) + + for bad_lib in bad_libs: + code = template.format(before="import numpy as np", after="", + bad_lib=bad_lib) + subprocess.check_call([sys.executable, "-c", code]) + + # Swapped import order + code = template.format(after="import numpy as np", before="", + bad_lib=bad_lib) + subprocess.check_call([sys.executable, "-c", code]) + + class TestMultiDot(object): def test_basic_function_with_three_arguments(self): diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src index 03fdd387a..7dc1cb0cb 100644 --- a/numpy/linalg/umath_linalg.c.src +++ b/numpy/linalg/umath_linalg.c.src @@ -382,17 +382,11 @@ typedef f2c_doublecomplex fortran_doublecomplex; ***************************************************************************** */ -static NPY_INLINE void * -offset_ptr(void* ptr, ptrdiff_t offset) -{ - return (void*)((npy_uint8*)ptr + offset); -} - static NPY_INLINE int get_fp_invalid_and_clear(void) { int status; - status = npy_clear_floatstatus(); + status = npy_clear_floatstatus_barrier((char*)&status); return !!(status & NPY_FPE_INVALID); } @@ -403,7 +397,7 @@ set_fp_invalid_or_clear(int error_occurred) npy_set_floatstatus_invalid(); } else { - npy_clear_floatstatus(); + npy_clear_floatstatus_barrier((char*)&error_occurred); } } @@ -577,104 +571,6 @@ dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params) params->row_strides, params->column_strides); } - -static NPY_INLINE float -FLOAT_add(float op1, float op2) -{ - return op1 + op2; -} - -static NPY_INLINE double -DOUBLE_add(double op1, double op2) -{ - return op1 + op2; -} - -static NPY_INLINE COMPLEX_t -CFLOAT_add(COMPLEX_t op1, COMPLEX_t op2) -{ - COMPLEX_t result; - result.array[0] = op1.array[0] + op2.array[0]; - 
result.array[1] = op1.array[1] + op2.array[1]; - - return result; -} - -static NPY_INLINE DOUBLECOMPLEX_t -CDOUBLE_add(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) -{ - DOUBLECOMPLEX_t result; - result.array[0] = op1.array[0] + op2.array[0]; - result.array[1] = op1.array[1] + op2.array[1]; - - return result; -} - -static NPY_INLINE float -FLOAT_mul(float op1, float op2) -{ - return op1*op2; -} - -static NPY_INLINE double -DOUBLE_mul(double op1, double op2) -{ - return op1*op2; -} - - -static NPY_INLINE COMPLEX_t -CFLOAT_mul(COMPLEX_t op1, COMPLEX_t op2) -{ - COMPLEX_t result; - result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1]; - result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1]; - - return result; -} - -static NPY_INLINE DOUBLECOMPLEX_t -CDOUBLE_mul(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) -{ - DOUBLECOMPLEX_t result; - result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1]; - result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1]; - - return result; -} - -static NPY_INLINE float -FLOAT_mulc(float op1, float op2) -{ - return op1*op2; -} - -static NPY_INLINE double -DOUBLE_mulc(float op1, float op2) -{ - return op1*op2; -} - -static NPY_INLINE COMPLEX_t -CFLOAT_mulc(COMPLEX_t op1, COMPLEX_t op2) -{ - COMPLEX_t result; - result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1]; - result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0]; - - return result; -} - -static NPY_INLINE DOUBLECOMPLEX_t -CDOUBLE_mulc(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) -{ - DOUBLECOMPLEX_t result; - result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1]; - result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0]; - - return result; -} - static NPY_INLINE void print_FLOAT(npy_float s) { @@ -3306,7 +3202,7 @@ static void for (i = 0; i < nrhs; i++) { @ftyp@ *vector = components + i*m; /* Numpy and fortran floating types are the same size, - * so this 
case is safe */ + * so this cast is safe */ @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess); memcpy( resid + i*r_out.column_strides, diff --git a/numpy/ma/core.py b/numpy/ma/core.py index 91cf8ed0f..5ed086db3 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -2799,13 +2799,8 @@ class MaskedArray(ndarray): # FIXME _sharedmask is never used. _sharedmask = True # Process mask. - # Number of named fields (or zero if none) - names_ = _data.dtype.names or () # Type of the mask - if names_: - mdtype = make_mask_descr(_data.dtype) - else: - mdtype = MaskType + mdtype = make_mask_descr(_data.dtype) if mask is nomask: # Case 1. : no mask in input. @@ -2831,14 +2826,12 @@ class MaskedArray(ndarray): _data._mask = mask _data._sharedmask = False else: + _data._sharedmask = not copy if copy: _data._mask = _data._mask.copy() - _data._sharedmask = False # Reset the shape of the original mask if getmask(data) is not nomask: data._mask.shape = data.shape - else: - _data._sharedmask = True else: # Case 2. : With a mask in input. # If mask is boolean, create an array of True or False @@ -2875,7 +2868,7 @@ class MaskedArray(ndarray): _data._mask = mask _data._sharedmask = not copy else: - if names_: + if _data.dtype.names: def _recursive_or(a, b): "do a|=b on each field of a, recursively" for name in a.dtype.names: @@ -2884,7 +2877,7 @@ class MaskedArray(ndarray): _recursive_or(af, bf) else: af |= bf - return + _recursive_or(_data._mask, mask) else: _data._mask = np.logical_or(mask, _data._mask) @@ -2999,7 +2992,9 @@ class MaskedArray(ndarray): order = "K" _mask = _mask.astype(_mask_dtype, order) - + else: + # Take a view so shape changes, etc., do not propagate back. + _mask = _mask.view() else: _mask = nomask @@ -3089,7 +3084,7 @@ class MaskedArray(ndarray): returned object (this is equivalent to setting the ``type`` parameter). type : Python type, optional - Type of the returned view, e.g., ndarray or matrix. 
Again, the + Type of the returned view, either ndarray or a subclass. The default None results in type preservation. Notes @@ -3344,17 +3339,35 @@ class MaskedArray(ndarray): _mask[indx] = mindx return - def __setattr__(self, attr, value): - super(MaskedArray, self).__setattr__(attr, value) - if attr == 'dtype' and self._mask is not nomask: - self._mask = self._mask.view(make_mask_descr(value), ndarray) - # Try to reset the shape of the mask (if we don't have a void) - # This raises a ValueError if the dtype change won't work + # Define so that we can overwrite the setter. + @property + def dtype(self): + return super(MaskedArray, self).dtype + + @dtype.setter + def dtype(self, dtype): + super(MaskedArray, type(self)).dtype.__set__(self, dtype) + if self._mask is not nomask: + self._mask = self._mask.view(make_mask_descr(dtype), ndarray) + # Try to reset the shape of the mask (if we don't have a void). + # This raises a ValueError if the dtype change won't work. try: self._mask.shape = self.shape except (AttributeError, TypeError): pass + @property + def shape(self): + return super(MaskedArray, self).shape + + @shape.setter + def shape(self, shape): + super(MaskedArray, type(self)).shape.__set__(self, shape) + # Cannot use self._mask, since it may not (yet) exist when a + # masked matrix sets the shape. + if getmask(self) is not nomask: + self._mask.shape = self.shape + def __setmask__(self, mask, copy=False): """ Set the mask. @@ -3673,14 +3686,14 @@ class MaskedArray(ndarray): >>> type(x.filled()) <type 'numpy.ndarray'> - Subclassing is preserved. This means that if the data part of the masked - array is a matrix, `filled` returns a matrix: - - >>> x = np.ma.array(np.matrix([[1, 2], [3, 4]]), mask=[[0, 1], [1, 0]]) - >>> x.filled() - matrix([[ 1, 999999], - [999999, 4]]) + Subclassing is preserved. 
This means that if, e.g., the data part of + the masked array is a recarray, `filled` returns a recarray: + >>> x = np.array([(-1, 2), (-3, 4)], dtype='i8,i8').view(np.recarray) + >>> m = np.ma.array(x, mask=[(True, False), (False, True)]) + >>> m.filled() + rec.array([(999999, 2), ( -3, 999999)], + dtype=[('f0', '<i8'), ('f1', '<i8')]) """ m = self._mask if m is nomask: @@ -5531,15 +5544,7 @@ class MaskedArray(ndarray): sidx = self.argsort(axis=axis, kind=kind, order=order, fill_value=fill_value, endwith=endwith) - # save memory for 1d arrays - if self.ndim == 1: - idx = sidx - else: - idx = list(np.ix_(*[np.arange(x) for x in self.shape])) - idx[axis] = sidx - idx = tuple(idx) - - self[...] = self[idx] + self[...] = np.take_along_axis(self, sidx, axis=axis) def min(self, axis=None, out=None, fill_value=None, keepdims=np._NoValue): """ @@ -6317,6 +6322,12 @@ class MaskedConstant(MaskedArray): # precedent for this with `np.bool_` scalars. return self + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self + def __setattr__(self, attr, value): if not self.__has_singleton(): # allow the singleton to be initialized diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py index 8272dced9..3be4d3625 100644 --- a/numpy/ma/extras.py +++ b/numpy/ma/extras.py @@ -747,19 +747,17 @@ def _median(a, axis=None, out=None, overwrite_input=False): return np.ma.minimum_fill_value(asorted) return s - counts = count(asorted, axis=axis) + counts = count(asorted, axis=axis, keepdims=True) h = counts // 2 - # create indexing mesh grid for all but reduced axis - axes_grid = [np.arange(x) for i, x in enumerate(asorted.shape) - if i != axis] - ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij') + # duplicate high if odd number of elements so mean does nothing + odd = counts % 2 == 1 + l = np.where(odd, h, h-1) - # insert indices of low and high median - ind.insert(axis, h - 1) - low = asorted[tuple(ind)] - ind[axis] = np.minimum(h, asorted.shape[axis] - 1) - 
high = asorted[tuple(ind)] + lh = np.concatenate([l,h], axis=axis) + + # get low and high median + low_high = np.take_along_axis(asorted, lh, axis=axis) def replace_masked(s): # Replace masked entries with minimum_full_value unless it all values @@ -767,30 +765,20 @@ def _median(a, axis=None, out=None, overwrite_input=False): # larger than the fill value is undefined and a valid value placed # elsewhere, e.g. [4, --, inf]. if np.ma.is_masked(s): - rep = (~np.all(asorted.mask, axis=axis)) & s.mask + rep = (~np.all(asorted.mask, axis=axis, keepdims=True)) & s.mask s.data[rep] = np.ma.minimum_fill_value(asorted) s.mask[rep] = False - replace_masked(low) - replace_masked(high) - - # duplicate high if odd number of elements so mean does nothing - odd = counts % 2 == 1 - np.copyto(low, high, where=odd) - # not necessary for scalar True/False masks - try: - np.copyto(low.mask, high.mask, where=odd) - except Exception: - pass + replace_masked(low_high) if np.issubdtype(asorted.dtype, np.inexact): # avoid inf / x = masked - s = np.ma.sum([low, high], axis=0, out=out) + s = np.ma.sum(low_high, axis=axis, out=out) np.true_divide(s.data, 2., casting='unsafe', out=s.data) s = np.lib.utils._median_nancheck(asorted, s, axis, out) else: - s = np.ma.mean([low, high], axis=0, out=out) + s = np.ma.mean(low_high, axis=axis, out=out) return s @@ -1465,9 +1453,14 @@ class MAxisConcatenator(AxisConcatenator): """ concatenate = staticmethod(concatenate) - @staticmethod - def makemat(arr): - return array(arr.data.view(np.matrix), mask=arr.mask) + @classmethod + def makemat(cls, arr): + # There used to be a view as np.matrix here, but we may eventually + # deprecate that class. 
In preparation, we use the unmasked version + # to construct the matrix (with copy=False for backwards compatibility + # with the .view) + data = super(MAxisConcatenator, cls).makemat(arr.data, copy=False) + return array(data, mask=arr.mask) def __getitem__(self, key): # matrix builder syntax, like 'a, b; c, d' diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py index 9caf38b56..51616f214 100644 --- a/numpy/ma/tests/test_core.py +++ b/numpy/ma/tests/test_core.py @@ -335,49 +335,6 @@ class TestMaskedArray(object): assert_equal(s1, s2) assert_(x1[1:1].shape == (0,)) - def test_matrix_indexing(self): - # Tests conversions and indexing - x1 = np.matrix([[1, 2, 3], [4, 3, 2]]) - x2 = array(x1, mask=[[1, 0, 0], [0, 1, 0]]) - x3 = array(x1, mask=[[0, 1, 0], [1, 0, 0]]) - x4 = array(x1) - # test conversion to strings - str(x2) # raises? - repr(x2) # raises? - # tests of indexing - assert_(type(x2[1, 0]) is type(x1[1, 0])) - assert_(x1[1, 0] == x2[1, 0]) - assert_(x2[1, 1] is masked) - assert_equal(x1[0, 2], x2[0, 2]) - assert_equal(x1[0, 1:], x2[0, 1:]) - assert_equal(x1[:, 2], x2[:, 2]) - assert_equal(x1[:], x2[:]) - assert_equal(x1[1:], x3[1:]) - x1[0, 2] = 9 - x2[0, 2] = 9 - assert_equal(x1, x2) - x1[0, 1:] = 99 - x2[0, 1:] = 99 - assert_equal(x1, x2) - x2[0, 1] = masked - assert_equal(x1, x2) - x2[0, 1:] = masked - assert_equal(x1, x2) - x2[0, :] = x1[0, :] - x2[0, 1] = masked - assert_(allequal(getmask(x2), np.array([[0, 1, 0], [0, 1, 0]]))) - x3[1, :] = masked_array([1, 2, 3], [1, 1, 0]) - assert_(allequal(getmask(x3)[1], array([1, 1, 0]))) - assert_(allequal(getmask(x3[1]), array([1, 1, 0]))) - x4[1, :] = masked_array([1, 2, 3], [1, 1, 0]) - assert_(allequal(getmask(x4[1]), array([1, 1, 0]))) - assert_(allequal(x4[1], array([1, 2, 3]))) - x1 = np.matrix(np.arange(5) * 1.0) - x2 = masked_values(x1, 3.0) - assert_equal(x1, x2) - assert_(allequal(array([0, 0, 0, 1, 0], MaskType), x2.mask)) - assert_equal(3.0, x2.fill_value) - 
@suppress_copy_mask_on_assignment def test_copy(self): # Tests of some subtle points of copying and sizing. @@ -395,9 +352,11 @@ class TestMaskedArray(object): assert_equal(y1._mask.__array_interface__, m.__array_interface__) y1a = array(y1) + # Default for masked array is not to copy; see gh-10318. assert_(y1a._data.__array_interface__ == y1._data.__array_interface__) - assert_(y1a.mask is y1.mask) + assert_(y1a._mask.__array_interface__ == + y1._mask.__array_interface__) y2 = array(x1, mask=m3) assert_(y2._data.__array_interface__ == x1.__array_interface__) @@ -611,11 +570,13 @@ class TestMaskedArray(object): def test_pickling_subbaseclass(self): # Test pickling w/ a subclass of ndarray - a = array(np.matrix(list(range(10))), mask=[1, 0, 1, 0, 0] * 2) + x = np.array([(1.0, 2), (3.0, 4)], + dtype=[('x', float), ('y', int)]).view(np.recarray) + a = masked_array(x, mask=[(True, False), (False, True)]) a_pickled = pickle.loads(a.dumps()) assert_equal(a_pickled._mask, a._mask) assert_equal(a_pickled, a) - assert_(isinstance(a_pickled._data, np.matrix)) + assert_(isinstance(a_pickled._data, np.recarray)) def test_pickling_maskedconstant(self): # Test pickling MaskedConstant @@ -1448,16 +1409,6 @@ class TestMaskedArrayArithmetic(object): assert_(result is output) assert_(output[0] is masked) - def test_count_mean_with_matrix(self): - m = np.ma.array(np.matrix([[1,2],[3,4]]), mask=np.zeros((2,2))) - - assert_equal(m.count(axis=0).shape, (1,2)) - assert_equal(m.count(axis=1).shape, (2,1)) - - #make sure broadcasting inside mean and var work - assert_equal(m.mean(axis=0), [[2., 3.]]) - assert_equal(m.mean(axis=1), [[1.5], [3.5]]) - def test_eq_on_structured(self): # Test the equality of structured arrays ndtype = [('A', int), ('B', int)] @@ -1740,23 +1691,6 @@ class TestMaskedArrayAttributes(object): def test_flat(self): # Test that flat can return all types of items [#4585, #4615] - # test simple access - test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1]) - 
assert_equal(test.flat[1], 2) - assert_equal(test.flat[2], masked) - assert_(np.all(test.flat[0:2] == test[0, 0:2])) - # Test flat on masked_matrices - test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1]) - test.flat = masked_array([3, 2, 1], mask=[1, 0, 0]) - control = masked_array(np.matrix([[3, 2, 1]]), mask=[1, 0, 0]) - assert_equal(test, control) - # Test setting - test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1]) - testflat = test.flat - testflat[:] = testflat[[2, 1, 0]] - assert_equal(test, control) - testflat[0] = 9 - assert_equal(test[0, 0], 9) # test 2-D record array # ... on structured array w/ masked records x = array([[(1, 1.1, 'one'), (2, 2.2, 'two'), (3, 3.3, 'thr')], @@ -1784,12 +1718,6 @@ class TestMaskedArrayAttributes(object): if i >= x.shape[-1]: i = 0 j += 1 - # test that matrices keep the correct shape (#4615) - a = masked_array(np.matrix(np.eye(2)), mask=0) - b = a.flat - b01 = b[:2] - assert_equal(b01.data, array([[1., 0.]])) - assert_equal(b01.mask, array([[False, False]])) def test_assign_dtype(self): # check that the mask's dtype is updated when dtype is changed @@ -2893,32 +2821,6 @@ class TestMaskedArrayMethods(object): assert_equal(mxsmall.any(0), [True, True, False]) assert_equal(mxsmall.any(1), [True, True, False]) - def test_allany_onmatrices(self): - x = np.array([[0.13, 0.26, 0.90], - [0.28, 0.33, 0.63], - [0.31, 0.87, 0.70]]) - X = np.matrix(x) - m = np.array([[True, False, False], - [False, False, False], - [True, True, False]], dtype=np.bool_) - mX = masked_array(X, mask=m) - mXbig = (mX > 0.5) - mXsmall = (mX < 0.5) - - assert_(not mXbig.all()) - assert_(mXbig.any()) - assert_equal(mXbig.all(0), np.matrix([False, False, True])) - assert_equal(mXbig.all(1), np.matrix([False, False, True]).T) - assert_equal(mXbig.any(0), np.matrix([False, False, True])) - assert_equal(mXbig.any(1), np.matrix([True, True, True]).T) - - assert_(not mXsmall.all()) - assert_(mXsmall.any()) - assert_equal(mXsmall.all(0), 
np.matrix([True, True, False])) - assert_equal(mXsmall.all(1), np.matrix([False, False, False]).T) - assert_equal(mXsmall.any(0), np.matrix([True, True, False])) - assert_equal(mXsmall.any(1), np.matrix([True, True, False]).T) - def test_allany_oddities(self): # Some fun with all and any store = empty((), dtype=bool) @@ -3017,14 +2919,6 @@ class TestMaskedArrayMethods(object): b = a.compressed() assert_equal(b, [2, 3, 4]) - a = array(np.matrix([1, 2, 3, 4]), mask=[0, 0, 0, 0]) - b = a.compressed() - assert_equal(b, a) - assert_(isinstance(b, np.matrix)) - a[0, 0] = masked - b = a.compressed() - assert_equal(b, [[2, 3, 4]]) - def test_empty(self): # Tests empty/like datatype = [('a', int), ('b', float), ('c', '|S8')] @@ -3139,10 +3033,6 @@ class TestMaskedArrayMethods(object): a = array([0, 0], mask=[1, 1]) aravel = a.ravel() assert_equal(aravel._mask.shape, a.shape) - a = array(np.matrix([1, 2, 3, 4, 5]), mask=[[0, 1, 0, 0, 0]]) - aravel = a.ravel() - assert_equal(aravel.shape, (1, 5)) - assert_equal(aravel._mask.shape, a.shape) # Checks that small_mask is preserved a = array([1, 2, 3, 4], mask=[0, 0, 0, 0], shrink=False) assert_equal(a.ravel()._mask, [0, 0, 0, 0]) @@ -4607,10 +4497,6 @@ class TestMaskedFields(object): assert_equal(test, data) assert_equal(test.mask, controlmask.reshape(-1, 2)) - test = a.view((float, 2), np.matrix) - assert_equal(test, data) - assert_(isinstance(test, np.matrix)) - def test_getitem(self): ndtype = [('a', float), ('b', float)] a = array(list(zip(np.random.rand(10), np.arange(10))), dtype=ndtype) @@ -4794,11 +4680,12 @@ class TestMaskedView(object): def test_view_to_dtype_and_type(self): (data, a, controlmask) = self.data - test = a.view((float, 2), np.matrix) + test = a.view((float, 2), np.recarray) assert_equal(test, data) - assert_(isinstance(test, np.matrix)) + assert_(isinstance(test, np.recarray)) assert_(not isinstance(test, MaskedArray)) + class TestOptionalArgs(object): def test_ndarrayfuncs(self): # test axis arg behaves 
the same as ndarray (including multiple axes) @@ -4941,6 +4828,16 @@ class TestMaskedConstant(object): np.ma.masked.copy() is np.ma.masked, np.True_.copy() is np.True_) + def test__copy(self): + import copy + assert_( + copy.copy(np.ma.masked) is np.ma.masked) + + def test_deepcopy(self): + import copy + assert_( + copy.deepcopy(np.ma.masked) is np.ma.masked) + def test_immutable(self): orig = np.ma.masked assert_raises(np.ma.core.MaskError, operator.setitem, orig, (), 1) diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py index a7a32b628..c29bec2bd 100644 --- a/numpy/ma/tests/test_extras.py +++ b/numpy/ma/tests/test_extras.py @@ -307,18 +307,6 @@ class TestConcatenator(object): assert_array_equal(d[5:,:], b_2) assert_array_equal(d.mask, np.r_[m_1, m_2]) - def test_matrix_builder(self): - assert_raises(np.ma.MAError, lambda: mr_['1, 2; 3, 4']) - - def test_matrix(self): - actual = mr_['r', 1, 2, 3] - expected = np.ma.array(np.r_['r', 1, 2, 3]) - assert_array_equal(actual, expected) - - # outer type is masked array, inner type is matrix - assert_equal(type(actual), type(expected)) - assert_equal(type(actual.data), type(expected.data)) - def test_masked_constant(self): actual = mr_[np.ma.masked, 1] assert_equal(actual.mask, [True, False]) diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py index 70eab0edc..d7b1e3c18 100644 --- a/numpy/ma/tests/test_old_ma.py +++ b/numpy/ma/tests/test_old_ma.py @@ -273,7 +273,11 @@ class TestMa(object): assert_(y1.mask is m) y1a = array(y1, copy=0) - assert_(y1a.mask is y1.mask) + # For copy=False, one might expect that the array would just + # passed on, i.e., that it would be "is" instead of "==". + # See gh-4043 for discussion. 
+ assert_(y1a._mask.__array_interface__ == + y1._mask.__array_interface__) y2 = array(x1, mask=m3, copy=0) assert_(y2.mask is m3) diff --git a/numpy/ma/tests/test_regression.py b/numpy/ma/tests/test_regression.py index 04e10d9d1..96c418a51 100644 --- a/numpy/ma/tests/test_regression.py +++ b/numpy/ma/tests/test_regression.py @@ -74,3 +74,13 @@ class TestRegression(object): r1 = np.ma.corrcoef(x, y, ddof=1) # ddof should not have an effect (it gets cancelled out) assert_allclose(r0.data, r1.data) + + def test_mask_not_backmangled(self): + # See gh-10314. Test case taken from gh-3140. + a = np.ma.MaskedArray([1., 2.], mask=[False, False]) + assert_(a.mask.shape == (2,)) + b = np.tile(a, (2, 1)) + # Check that the above no longer changes a.shape to (1, 2) + assert_(a.mask.shape == (2,)) + assert_(b.shape == (2, 2)) + assert_(b.mask.shape == (2, 2)) diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py index b61a46278..f8ab52bb9 100644 --- a/numpy/ma/tests/test_subclassing.py +++ b/numpy/ma/tests/test_subclassing.py @@ -75,27 +75,6 @@ class MSubArray(SubArray, MaskedArray): msubarray = MSubArray -class MMatrix(MaskedArray, np.matrix,): - - def __new__(cls, data, mask=nomask): - mat = np.matrix(data) - _data = MaskedArray.__new__(cls, data=mat, mask=mask) - return _data - - def __array_finalize__(self, obj): - np.matrix.__array_finalize__(self, obj) - MaskedArray.__array_finalize__(self, obj) - return - - def _get_series(self): - _view = self.view(MaskedArray) - _view._sharedmask = False - return _view - _series = property(fget=_get_series) - -mmatrix = MMatrix - - # Also a subclass that overrides __str__, __repr__ and __setitem__, disallowing # setting to non-class values (and thus np.ma.core.masked_print_option) # and overrides __array_wrap__, updating the info dict, to check that this @@ -180,7 +159,7 @@ class TestSubclassing(object): def setup(self): x = np.arange(5, dtype='float') - mx = mmatrix(x, mask=[0, 1, 0, 0, 0]) + mx = 
msubarray(x, mask=[0, 1, 0, 0, 0]) self.data = (x, mx) def test_data_subclassing(self): @@ -196,34 +175,34 @@ class TestSubclassing(object): def test_maskedarray_subclassing(self): # Tests subclassing MaskedArray (x, mx) = self.data - assert_(isinstance(mx._data, np.matrix)) + assert_(isinstance(mx._data, subarray)) def test_masked_unary_operations(self): # Tests masked_unary_operation (x, mx) = self.data with np.errstate(divide='ignore'): - assert_(isinstance(log(mx), mmatrix)) + assert_(isinstance(log(mx), msubarray)) assert_equal(log(x), np.log(x)) def test_masked_binary_operations(self): # Tests masked_binary_operation (x, mx) = self.data - # Result should be a mmatrix - assert_(isinstance(add(mx, mx), mmatrix)) - assert_(isinstance(add(mx, x), mmatrix)) + # Result should be a msubarray + assert_(isinstance(add(mx, mx), msubarray)) + assert_(isinstance(add(mx, x), msubarray)) # Result should work assert_equal(add(mx, x), mx+x) - assert_(isinstance(add(mx, mx)._data, np.matrix)) - assert_(isinstance(add.outer(mx, mx), mmatrix)) - assert_(isinstance(hypot(mx, mx), mmatrix)) - assert_(isinstance(hypot(mx, x), mmatrix)) + assert_(isinstance(add(mx, mx)._data, subarray)) + assert_(isinstance(add.outer(mx, mx), msubarray)) + assert_(isinstance(hypot(mx, mx), msubarray)) + assert_(isinstance(hypot(mx, x), msubarray)) def test_masked_binary_operations2(self): # Tests domained_masked_binary_operation (x, mx) = self.data xmx = masked_array(mx.data.__array__(), mask=mx.mask) - assert_(isinstance(divide(mx, mx), mmatrix)) - assert_(isinstance(divide(mx, x), mmatrix)) + assert_(isinstance(divide(mx, mx), msubarray)) + assert_(isinstance(divide(mx, x), msubarray)) assert_equal(divide(mx, mx), divide(xmx, xmx)) def test_attributepropagation(self): diff --git a/numpy/matrixlib/defmatrix.py b/numpy/matrixlib/defmatrix.py index 1f5c94921..7baa401a8 100644 --- a/numpy/matrixlib/defmatrix.py +++ b/numpy/matrixlib/defmatrix.py @@ -3,10 +3,14 @@ from __future__ import division, 
absolute_import, print_function __all__ = ['matrix', 'bmat', 'mat', 'asmatrix'] import sys +import warnings import ast import numpy.core.numeric as N -from numpy.core.numeric import concatenate, isscalar, binary_repr, identity, asanyarray -from numpy.core.numerictypes import issubdtype +from numpy.core.numeric import concatenate, isscalar +# While not in __all__, matrix_power used to be defined here, so we import +# it for backward compatibility. +from numpy.linalg import matrix_power + def _convert_from_string(data): for char in '[]': @@ -63,118 +67,14 @@ def asmatrix(data, dtype=None): """ return matrix(data, dtype=dtype, copy=False) -def matrix_power(M, n): - """ - Raise a square matrix to the (integer) power `n`. - - For positive integers `n`, the power is computed by repeated matrix - squarings and matrix multiplications. If ``n == 0``, the identity matrix - of the same shape as M is returned. If ``n < 0``, the inverse - is computed and then raised to the ``abs(n)``. - - Parameters - ---------- - M : ndarray or matrix object - Matrix to be "powered." Must be square, i.e. ``M.shape == (m, m)``, - with `m` a positive integer. - n : int - The exponent can be any integer or long integer, positive, - negative, or zero. - - Returns - ------- - M**n : ndarray or matrix object - The return value is the same shape and type as `M`; - if the exponent is positive or zero then the type of the - elements is the same as those of `M`. If the exponent is - negative the elements are floating-point. - - Raises - ------ - LinAlgError - If the matrix is not numerically invertible. - - See Also - -------- - matrix - Provides an equivalent function as the exponentiation operator - (``**``, not ``^``). - - Examples - -------- - >>> from numpy import linalg as LA - >>> i = np.array([[0, 1], [-1, 0]]) # matrix equiv. 
of the imaginary unit - >>> LA.matrix_power(i, 3) # should = -i - array([[ 0, -1], - [ 1, 0]]) - >>> LA.matrix_power(np.matrix(i), 3) # matrix arg returns matrix - matrix([[ 0, -1], - [ 1, 0]]) - >>> LA.matrix_power(i, 0) - array([[1, 0], - [0, 1]]) - >>> LA.matrix_power(i, -3) # should = 1/(-i) = i, but w/ f.p. elements - array([[ 0., 1.], - [-1., 0.]]) - - Somewhat more sophisticated example - - >>> q = np.zeros((4, 4)) - >>> q[0:2, 0:2] = -i - >>> q[2:4, 2:4] = i - >>> q # one of the three quaternion units not equal to 1 - array([[ 0., -1., 0., 0.], - [ 1., 0., 0., 0.], - [ 0., 0., 0., 1.], - [ 0., 0., -1., 0.]]) - >>> LA.matrix_power(q, 2) # = -np.eye(4) - array([[-1., 0., 0., 0.], - [ 0., -1., 0., 0.], - [ 0., 0., -1., 0.], - [ 0., 0., 0., -1.]]) - - """ - M = asanyarray(M) - if M.ndim != 2 or M.shape[0] != M.shape[1]: - raise ValueError("input must be a square array") - if not issubdtype(type(n), N.integer): - raise TypeError("exponent must be an integer") - - from numpy.linalg import inv - - if n==0: - M = M.copy() - M[:] = identity(M.shape[0]) - return M - elif n<0: - M = inv(M) - n *= -1 - - result = M - if n <= 3: - for _ in range(n-1): - result=N.dot(result, M) - return result - - # binary decomposition to reduce the number of Matrix - # multiplications for n > 3. - beta = binary_repr(n) - Z, q, t = M, 0, len(beta) - while beta[t-q-1] == '0': - Z = N.dot(Z, Z) - q += 1 - result = Z - for k in range(q+1, t): - Z = N.dot(Z, Z) - if beta[t-k-1] == '1': - result = N.dot(result, Z) - return result - - class matrix(N.ndarray): """ matrix(data, dtype=None, copy=True) + .. note:: It is no longer recommended to use this class, even for linear + algebra. Instead use regular arrays. The class may be removed + in the future. + Returns a matrix from an array-like object, or from a string of data. A matrix is a specialized 2-D array that retains its 2-D nature through operations. 
It has certain special operators, such as ``*`` @@ -210,6 +110,12 @@ class matrix(N.ndarray): """ __array_priority__ = 10.0 def __new__(subtype, data, dtype=None, copy=True): + warnings.warn('the matrix subclass is not the recommended way to ' + 'represent matrices or deal with linear algebra (see ' + 'https://docs.scipy.org/doc/numpy/user/' + 'numpy-for-matlab-users.html). ' + 'Please adjust your code to use regular ndarray.', + PendingDeprecationWarning, stacklevel=2) if isinstance(data, matrix): dtype2 = data.dtype if (dtype is None): diff --git a/numpy/matrixlib/tests/test_defmatrix.py b/numpy/matrixlib/tests/test_defmatrix.py index a02a05c09..e74e83cdb 100644 --- a/numpy/matrixlib/tests/test_defmatrix.py +++ b/numpy/matrixlib/tests/test_defmatrix.py @@ -1,5 +1,13 @@ from __future__ import division, absolute_import, print_function +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + try: # Accessing collections abstract classes from collections # has been deprecated since Python 3.3 @@ -13,7 +21,7 @@ from numpy.testing import ( assert_, assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_raises ) -from numpy.matrixlib.defmatrix import matrix_power +from numpy.linalg import matrix_power from numpy.matrixlib import mat class TestCtor(object): diff --git a/numpy/matrixlib/tests/test_interaction.py b/numpy/matrixlib/tests/test_interaction.py new file mode 100644 index 000000000..fb4d8f98c --- /dev/null +++ b/numpy/matrixlib/tests/test_interaction.py @@ -0,0 +1,369 @@ +"""Tests of interaction of matrix with other parts of numpy. + +Note that tests with MaskedArray and linalg are done in separate files. 
+""" +from __future__ import division, absolute_import, print_function + +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + +import textwrap +import warnings + +import numpy as np +from numpy.testing import (assert_, assert_equal, assert_raises, + assert_raises_regex, assert_array_equal, + assert_almost_equal, assert_array_almost_equal) + + +def test_fancy_indexing(): + # The matrix class messes with the shape. While this is always + # weird (getitem is not used, it does not have setitem nor knows + # about fancy indexing), this tests gh-3110 + # 2018-04-29: moved here from core.tests.test_index. + m = np.matrix([[1, 2], [3, 4]]) + + assert_(isinstance(m[[0, 1, 0], :], np.matrix)) + + # gh-3110. Note the transpose currently because matrices do *not* + # support dimension fixing for fancy indexing correctly. + x = np.asmatrix(np.arange(50).reshape(5, 10)) + assert_equal(x[:2, np.array(-1)], x[:2, -1].T) + + +def test_polynomial_mapdomain(): + # test that polynomial preserved matrix subtype. + # 2018-04-29: moved here from polynomial.tests.polyutils. 
+ dom1 = [0, 4] + dom2 = [1, 3] + x = np.matrix([dom1, dom1]) + res = np.polynomial.polyutils.mapdomain(x, dom1, dom2) + assert_(isinstance(res, np.matrix)) + + +def test_sort_matrix_none(): + # 2018-04-29: moved here from core.tests.test_multiarray + a = np.matrix([[2, 1, 0]]) + actual = np.sort(a, axis=None) + expected = np.matrix([[0, 1, 2]]) + assert_equal(actual, expected) + assert_(type(expected) is np.matrix) + + +def test_partition_matrix_none(): + # gh-4301 + # 2018-04-29: moved here from core.tests.test_multiarray + a = np.matrix([[2, 1, 0]]) + actual = np.partition(a, 1, axis=None) + expected = np.matrix([[0, 1, 2]]) + assert_equal(actual, expected) + assert_(type(expected) is np.matrix) + + +def test_dot_scalar_and_matrix_of_objects(): + # Ticket #2469 + # 2018-04-29: moved here from core.tests.test_multiarray + arr = np.matrix([1, 2], dtype=object) + desired = np.matrix([[3, 6]], dtype=object) + assert_equal(np.dot(arr, 3), desired) + assert_equal(np.dot(3, arr), desired) + + +def test_inner_scalar_and_matrix(): + # 2018-04-29: moved here from core.tests.test_multiarray + for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?': + sca = np.array(3, dtype=dt)[()] + arr = np.matrix([[1, 2], [3, 4]], dtype=dt) + desired = np.matrix([[3, 6], [9, 12]], dtype=dt) + assert_equal(np.inner(arr, sca), desired) + assert_equal(np.inner(sca, arr), desired) + + +def test_inner_scalar_and_matrix_of_objects(): + # Ticket #4482 + # 2018-04-29: moved here from core.tests.test_multiarray + arr = np.matrix([1, 2], dtype=object) + desired = np.matrix([[3, 6]], dtype=object) + assert_equal(np.inner(arr, 3), desired) + assert_equal(np.inner(3, arr), desired) + + +def test_iter_allocate_output_subtype(): + # Make sure that the subtype with priority wins + # 2018-04-29: moved here from core.tests.test_nditer, given the + # matrix specific shape test. 
+ + # matrix vs ndarray + a = np.matrix([[1, 2], [3, 4]]) + b = np.arange(4).reshape(2, 2).T + i = np.nditer([a, b, None], [], + [['readonly'], ['readonly'], ['writeonly', 'allocate']]) + assert_(type(i.operands[2]) is np.matrix) + assert_(type(i.operands[2]) is not np.ndarray) + assert_equal(i.operands[2].shape, (2, 2)) + + # matrix always wants things to be 2D + b = np.arange(4).reshape(1, 2, 2) + assert_raises(RuntimeError, np.nditer, [a, b, None], [], + [['readonly'], ['readonly'], ['writeonly', 'allocate']]) + # but if subtypes are disabled, the result can still work + i = np.nditer([a, b, None], [], + [['readonly'], ['readonly'], + ['writeonly', 'allocate', 'no_subtype']]) + assert_(type(i.operands[2]) is np.ndarray) + assert_(type(i.operands[2]) is not np.matrix) + assert_equal(i.operands[2].shape, (1, 2, 2)) + + +def like_function(): + # 2018-04-29: moved here from core.tests.test_numeric + a = np.matrix([[1, 2], [3, 4]]) + for like_function in np.zeros_like, np.ones_like, np.empty_like: + b = like_function(a) + assert_(type(b) is np.matrix) + + c = like_function(a, subok=False) + assert_(type(c) is not np.matrix) + + +def test_array_astype(): + # 2018-04-29: copied here from core.tests.test_api + # subok=True passes through a matrix + a = np.matrix([[0, 1, 2], [3, 4, 5]], dtype='f4') + b = a.astype('f4', subok=True, copy=False) + assert_(a is b) + + # subok=True is default, and creates a subtype on a cast + b = a.astype('i4', copy=False) + assert_equal(a, b) + assert_equal(type(b), np.matrix) + + # subok=False never returns a matrix + b = a.astype('f4', subok=False, copy=False) + assert_equal(a, b) + assert_(not (a is b)) + assert_(type(b) is not np.matrix) + + +def test_stack(): + # 2018-04-29: copied here from core.tests.test_shape_base + # check np.matrix cannot be stacked + m = np.matrix([[1, 2], [3, 4]]) + assert_raises_regex(ValueError, 'shape too large to be a matrix', + np.stack, [m, m]) + + +def test_object_scalar_multiply(): + # Tickets #2469 and 
#4482 + # 2018-04-29: moved here from core.tests.test_ufunc + arr = np.matrix([1, 2], dtype=object) + desired = np.matrix([[3, 6]], dtype=object) + assert_equal(np.multiply(arr, 3), desired) + assert_equal(np.multiply(3, arr), desired) + + +def test_nanfunctions_matrices(): + # Check that it works and that type and + # shape are preserved + # 2018-04-29: moved here from core.tests.test_nanfunctions + mat = np.matrix(np.eye(3)) + for f in [np.nanmin, np.nanmax]: + res = f(mat, axis=0) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (1, 3)) + res = f(mat, axis=1) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (3, 1)) + res = f(mat) + assert_(np.isscalar(res)) + # check that rows of nan are dealt with for subclasses (#4628) + mat[1] = np.nan + for f in [np.nanmin, np.nanmax]: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + res = f(mat, axis=0) + assert_(isinstance(res, np.matrix)) + assert_(not np.any(np.isnan(res))) + assert_(len(w) == 0) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + res = f(mat, axis=1) + assert_(isinstance(res, np.matrix)) + assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0]) + and not np.isnan(res[2, 0])) + assert_(len(w) == 1, 'no warning raised') + assert_(issubclass(w[0].category, RuntimeWarning)) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + res = f(mat) + assert_(np.isscalar(res)) + assert_(res != np.nan) + assert_(len(w) == 0) + + +def test_nanfunctions_matrices_general(): + # Check that it works and that type and + # shape are preserved + # 2018-04-29: moved here from core.tests.test_nanfunctions + mat = np.matrix(np.eye(3)) + for f in (np.nanargmin, np.nanargmax, np.nansum, np.nanprod, + np.nanmean, np.nanvar, np.nanstd): + res = f(mat, axis=0) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (1, 3)) + res = f(mat, axis=1) + assert_(isinstance(res, np.matrix)) + 
assert_(res.shape == (3, 1)) + res = f(mat) + assert_(np.isscalar(res)) + + for f in np.nancumsum, np.nancumprod: + res = f(mat, axis=0) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (3, 3)) + res = f(mat, axis=1) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (3, 3)) + res = f(mat) + assert_(isinstance(res, np.matrix)) + assert_(res.shape == (1, 3*3)) + + +def test_average_matrix(): + # 2018-04-29: moved here from core.tests.test_function_base. + y = np.matrix(np.random.rand(5, 5)) + assert_array_equal(y.mean(0), np.average(y, 0)) + + a = np.matrix([[1, 2], [3, 4]]) + w = np.matrix([[1, 2], [3, 4]]) + + r = np.average(a, axis=0, weights=w) + assert_equal(type(r), np.matrix) + assert_equal(r, [[2.5, 10.0/3]]) + + +def test_trapz_matrix(): + # Test to make sure matrices give the same answer as ndarrays + # 2018-04-29: moved here from core.tests.test_function_base. + x = np.linspace(0, 5) + y = x * x + r = np.trapz(y, x) + mx = np.matrix(x) + my = np.matrix(y) + mr = np.trapz(my, mx) + assert_almost_equal(mr, r) + + +def test_ediff1d_matrix(): + # 2018-04-29: moved here from core.tests.test_arraysetops. + assert(isinstance(np.ediff1d(np.matrix(1)), np.matrix)) + assert(isinstance(np.ediff1d(np.matrix(1), to_begin=1), np.matrix)) + + +def test_apply_along_axis_matrix(): + # this test is particularly malicious because matrix + # refuses to become 1d + # 2018-04-29: moved here from core.tests.test_shape_base. + def double(row): + return row * 2 + + m = np.matrix([[0, 1], [2, 3]]) + expected = np.matrix([[0, 2], [4, 6]]) + + result = np.apply_along_axis(double, 0, m) + assert_(isinstance(result, np.matrix)) + assert_array_equal(result, expected) + + result = np.apply_along_axis(double, 1, m) + assert_(isinstance(result, np.matrix)) + assert_array_equal(result, expected) + + +def test_kron_matrix(): + # 2018-04-29: moved here from core.tests.test_shape_base. 
+ a = np.ones([2, 2]) + m = np.asmatrix(a) + assert_equal(type(np.kron(a, a)), np.ndarray) + assert_equal(type(np.kron(m, m)), np.matrix) + assert_equal(type(np.kron(a, m)), np.matrix) + assert_equal(type(np.kron(m, a)), np.matrix) + + +class TestConcatenatorMatrix(object): + # 2018-04-29: moved here from core.tests.test_index_tricks. + def test_matrix(self): + a = [1, 2] + b = [3, 4] + + ab_r = np.r_['r', a, b] + ab_c = np.r_['c', a, b] + + assert_equal(type(ab_r), np.matrix) + assert_equal(type(ab_c), np.matrix) + + assert_equal(np.array(ab_r), [[1, 2, 3, 4]]) + assert_equal(np.array(ab_c), [[1], [2], [3], [4]]) + + assert_raises(ValueError, lambda: np.r_['rc', a, b]) + + def test_matrix_scalar(self): + r = np.r_['r', [1, 2], 3] + assert_equal(type(r), np.matrix) + assert_equal(np.array(r), [[1, 2, 3]]) + + def test_matrix_builder(self): + a = np.array([1]) + b = np.array([2]) + c = np.array([3]) + d = np.array([4]) + actual = np.r_['a, b; c, d'] + expected = np.bmat([[a, b], [c, d]]) + + assert_equal(actual, expected) + assert_equal(type(actual), type(expected)) + + +def test_array_equal_error_message_matrix(): + # 2018-04-29: moved here from testing.tests.test_utils. + try: + assert_equal(np.array([1, 2]), np.matrix([1, 2])) + except AssertionError as e: + msg = str(e) + msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)") + msg_reference = textwrap.dedent("""\ + + Arrays are not equal + + (shapes (2,), (1, 2) mismatch) + x: array([1, 2]) + y: matrix([[1, 2]])""") + try: + assert_equal(msg, msg_reference) + except AssertionError: + assert_equal(msg2, msg_reference) + else: + raise AssertionError("Did not raise") + + +def test_array_almost_equal_matrix(): + # Matrix slicing keeps things 2-D, while array does not necessarily. + # See gh-8452. + # 2018-04-29: moved here from testing.tests.test_utils. 
+ m1 = np.matrix([[1., 2.]]) + m2 = np.matrix([[1., np.nan]]) + m3 = np.matrix([[1., -np.inf]]) + m4 = np.matrix([[np.nan, np.inf]]) + m5 = np.matrix([[1., 2.], [np.nan, np.inf]]) + for assert_func in assert_array_almost_equal, assert_almost_equal: + for m in m1, m2, m3, m4, m5: + assert_func(m, m) + a = np.array(m) + assert_func(a, m) + assert_func(m, a) diff --git a/numpy/matrixlib/tests/test_masked_matrix.py b/numpy/matrixlib/tests/test_masked_matrix.py new file mode 100644 index 000000000..adc2e5419 --- /dev/null +++ b/numpy/matrixlib/tests/test_masked_matrix.py @@ -0,0 +1,239 @@ +from __future__ import division, absolute_import, print_function + +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + +import pickle + +import numpy as np +from numpy.ma.testutils import (assert_, assert_equal, assert_raises, + assert_array_equal) +from numpy.ma.core import (masked_array, masked_values, masked, allequal, + MaskType, getmask, MaskedArray, nomask, + log, add, hypot, divide) +from numpy.ma.extras import mr_ + + +class MMatrix(MaskedArray, np.matrix,): + + def __new__(cls, data, mask=nomask): + mat = np.matrix(data) + _data = MaskedArray.__new__(cls, data=mat, mask=mask) + return _data + + def __array_finalize__(self, obj): + np.matrix.__array_finalize__(self, obj) + MaskedArray.__array_finalize__(self, obj) + return + + def _get_series(self): + _view = self.view(MaskedArray) + _view._sharedmask = False + return _view + _series = property(fget=_get_series) + + +class TestMaskedMatrix(object): + def test_matrix_indexing(self): + # Tests conversions and indexing + x1 = np.matrix([[1, 2, 3], [4, 3, 2]]) + x2 = masked_array(x1, mask=[[1, 0, 0], [0, 1, 0]]) + x3 = masked_array(x1, mask=[[0, 1, 0], [1, 0, 0]]) + x4 = masked_array(x1) + # test conversion to strings + str(x2) # raises? 
+ repr(x2) # raises? + # tests of indexing + assert_(type(x2[1, 0]) is type(x1[1, 0])) + assert_(x1[1, 0] == x2[1, 0]) + assert_(x2[1, 1] is masked) + assert_equal(x1[0, 2], x2[0, 2]) + assert_equal(x1[0, 1:], x2[0, 1:]) + assert_equal(x1[:, 2], x2[:, 2]) + assert_equal(x1[:], x2[:]) + assert_equal(x1[1:], x3[1:]) + x1[0, 2] = 9 + x2[0, 2] = 9 + assert_equal(x1, x2) + x1[0, 1:] = 99 + x2[0, 1:] = 99 + assert_equal(x1, x2) + x2[0, 1] = masked + assert_equal(x1, x2) + x2[0, 1:] = masked + assert_equal(x1, x2) + x2[0, :] = x1[0, :] + x2[0, 1] = masked + assert_(allequal(getmask(x2), np.array([[0, 1, 0], [0, 1, 0]]))) + x3[1, :] = masked_array([1, 2, 3], [1, 1, 0]) + assert_(allequal(getmask(x3)[1], masked_array([1, 1, 0]))) + assert_(allequal(getmask(x3[1]), masked_array([1, 1, 0]))) + x4[1, :] = masked_array([1, 2, 3], [1, 1, 0]) + assert_(allequal(getmask(x4[1]), masked_array([1, 1, 0]))) + assert_(allequal(x4[1], masked_array([1, 2, 3]))) + x1 = np.matrix(np.arange(5) * 1.0) + x2 = masked_values(x1, 3.0) + assert_equal(x1, x2) + assert_(allequal(masked_array([0, 0, 0, 1, 0], dtype=MaskType), + x2.mask)) + assert_equal(3.0, x2.fill_value) + + def test_pickling_subbaseclass(self): + # Test pickling w/ a subclass of ndarray + a = masked_array(np.matrix(list(range(10))), mask=[1, 0, 1, 0, 0] * 2) + a_pickled = pickle.loads(a.dumps()) + assert_equal(a_pickled._mask, a._mask) + assert_equal(a_pickled, a) + assert_(isinstance(a_pickled._data, np.matrix)) + + def test_count_mean_with_matrix(self): + m = masked_array(np.matrix([[1, 2], [3, 4]]), mask=np.zeros((2, 2))) + + assert_equal(m.count(axis=0).shape, (1, 2)) + assert_equal(m.count(axis=1).shape, (2, 1)) + + # Make sure broadcasting inside mean and var work + assert_equal(m.mean(axis=0), [[2., 3.]]) + assert_equal(m.mean(axis=1), [[1.5], [3.5]]) + + def test_flat(self): + # Test that flat can return items even for matrices [#4585, #4615] + # test simple access + test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 
1]) + assert_equal(test.flat[1], 2) + assert_equal(test.flat[2], masked) + assert_(np.all(test.flat[0:2] == test[0, 0:2])) + # Test flat on masked_matrices + test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1]) + test.flat = masked_array([3, 2, 1], mask=[1, 0, 0]) + control = masked_array(np.matrix([[3, 2, 1]]), mask=[1, 0, 0]) + assert_equal(test, control) + # Test setting + test = masked_array(np.matrix([[1, 2, 3]]), mask=[0, 0, 1]) + testflat = test.flat + testflat[:] = testflat[[2, 1, 0]] + assert_equal(test, control) + testflat[0] = 9 + # test that matrices keep the correct shape (#4615) + a = masked_array(np.matrix(np.eye(2)), mask=0) + b = a.flat + b01 = b[:2] + assert_equal(b01.data, np.array([[1., 0.]])) + assert_equal(b01.mask, np.array([[False, False]])) + + def test_allany_onmatrices(self): + x = np.array([[0.13, 0.26, 0.90], + [0.28, 0.33, 0.63], + [0.31, 0.87, 0.70]]) + X = np.matrix(x) + m = np.array([[True, False, False], + [False, False, False], + [True, True, False]], dtype=np.bool_) + mX = masked_array(X, mask=m) + mXbig = (mX > 0.5) + mXsmall = (mX < 0.5) + + assert_(not mXbig.all()) + assert_(mXbig.any()) + assert_equal(mXbig.all(0), np.matrix([False, False, True])) + assert_equal(mXbig.all(1), np.matrix([False, False, True]).T) + assert_equal(mXbig.any(0), np.matrix([False, False, True])) + assert_equal(mXbig.any(1), np.matrix([True, True, True]).T) + + assert_(not mXsmall.all()) + assert_(mXsmall.any()) + assert_equal(mXsmall.all(0), np.matrix([True, True, False])) + assert_equal(mXsmall.all(1), np.matrix([False, False, False]).T) + assert_equal(mXsmall.any(0), np.matrix([True, True, False])) + assert_equal(mXsmall.any(1), np.matrix([True, True, False]).T) + + def test_compressed(self): + a = masked_array(np.matrix([1, 2, 3, 4]), mask=[0, 0, 0, 0]) + b = a.compressed() + assert_equal(b, a) + assert_(isinstance(b, np.matrix)) + a[0, 0] = masked + b = a.compressed() + assert_equal(b, [[2, 3, 4]]) + + def test_ravel(self): + a = 
masked_array(np.matrix([1, 2, 3, 4, 5]), mask=[[0, 1, 0, 0, 0]]) + aravel = a.ravel() + assert_equal(aravel.shape, (1, 5)) + assert_equal(aravel._mask.shape, a.shape) + + def test_view(self): + # Test view w/ flexible dtype + iterator = list(zip(np.arange(10), np.random.rand(10))) + data = np.array(iterator) + a = masked_array(iterator, dtype=[('a', float), ('b', float)]) + a.mask[0] = (1, 0) + test = a.view((float, 2), np.matrix) + assert_equal(test, data) + assert_(isinstance(test, np.matrix)) + assert_(not isinstance(test, MaskedArray)) + + +class TestSubclassing(object): + # Test suite for masked subclasses of ndarray. + + def setup(self): + x = np.arange(5, dtype='float') + mx = MMatrix(x, mask=[0, 1, 0, 0, 0]) + self.data = (x, mx) + + def test_maskedarray_subclassing(self): + # Tests subclassing MaskedArray + (x, mx) = self.data + assert_(isinstance(mx._data, np.matrix)) + + def test_masked_unary_operations(self): + # Tests masked_unary_operation + (x, mx) = self.data + with np.errstate(divide='ignore'): + assert_(isinstance(log(mx), MMatrix)) + assert_equal(log(x), np.log(x)) + + def test_masked_binary_operations(self): + # Tests masked_binary_operation + (x, mx) = self.data + # Result should be a MMatrix + assert_(isinstance(add(mx, mx), MMatrix)) + assert_(isinstance(add(mx, x), MMatrix)) + # Result should work + assert_equal(add(mx, x), mx+x) + assert_(isinstance(add(mx, mx)._data, np.matrix)) + assert_(isinstance(add.outer(mx, mx), MMatrix)) + assert_(isinstance(hypot(mx, mx), MMatrix)) + assert_(isinstance(hypot(mx, x), MMatrix)) + + def test_masked_binary_operations2(self): + # Tests domained_masked_binary_operation + (x, mx) = self.data + xmx = masked_array(mx.data.__array__(), mask=mx.mask) + assert_(isinstance(divide(mx, mx), MMatrix)) + assert_(isinstance(divide(mx, x), MMatrix)) + assert_equal(divide(mx, mx), divide(xmx, xmx)) + +class TestConcatenator(object): + # Tests for mr_, the equivalent of r_ for masked arrays. 
+ + def test_matrix_builder(self): + assert_raises(np.ma.MAError, lambda: mr_['1, 2; 3, 4']) + + def test_matrix(self): + # Test consistency with unmasked version. If we ever deprecate + # matrix, this test should either still pass, or both actual and + # expected should fail to be build. + actual = mr_['r', 1, 2, 3] + expected = np.ma.array(np.r_['r', 1, 2, 3]) + assert_array_equal(actual, expected) + + # outer type is masked array, inner type is matrix + assert_equal(type(actual), type(expected)) + assert_equal(type(actual.data), type(expected.data)) diff --git a/numpy/matrixlib/tests/test_matrix_linalg.py b/numpy/matrixlib/tests/test_matrix_linalg.py new file mode 100644 index 000000000..85c7693b4 --- /dev/null +++ b/numpy/matrixlib/tests/test_matrix_linalg.py @@ -0,0 +1,103 @@ +""" Test functions for linalg module using the matrix class.""" +from __future__ import division, absolute_import, print_function + +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + +import numpy as np + +from numpy.linalg.tests.test_linalg import ( + LinalgCase, apply_tag, TestQR as _TestQR, LinalgTestCase, + _TestNorm2D, _TestNormDoubleBase, _TestNormSingleBase, _TestNormInt64Base, + SolveCases, InvCases, EigvalsCases, EigCases, SVDCases, CondCases, + PinvCases, DetCases, LstsqCases) + + +CASES = [] + +# square test cases +CASES += apply_tag('square', [ + LinalgCase("0x0_matrix", + np.empty((0, 0), dtype=np.double).view(np.matrix), + np.empty((0, 1), dtype=np.double).view(np.matrix), + tags={'size-0'}), + LinalgCase("matrix_b_only", + np.array([[1., 2.], [3., 4.]]), + np.matrix([2., 1.]).T), + LinalgCase("matrix_a_and_b", + np.matrix([[1., 2.], [3., 4.]]), + np.matrix([2., 1.]).T), +]) + +# hermitian test-cases +CASES += apply_tag('hermitian', [ + LinalgCase("hmatrix_a_and_b", + np.matrix([[1., 2.], [2., 
1.]]), + None), +]) +# No need to make generalized or strided cases for matrices. + + +class MatrixTestCase(LinalgTestCase): + TEST_CASES = CASES + + +class TestSolveMatrix(SolveCases, MatrixTestCase): + pass + + +class TestInvMatrix(InvCases, MatrixTestCase): + pass + + +class TestEigvalsMatrix(EigvalsCases, MatrixTestCase): + pass + + +class TestEigMatrix(EigCases, MatrixTestCase): + pass + + +class TestSVDMatrix(SVDCases, MatrixTestCase): + pass + + +class TestCondMatrix(CondCases, MatrixTestCase): + pass + + +class TestPinvMatrix(PinvCases, MatrixTestCase): + pass + + +class TestDetMatrix(DetCases, MatrixTestCase): + pass + + +class TestLstsqMatrix(LstsqCases, MatrixTestCase): + pass + + +class _TestNorm2DMatrix(_TestNorm2D): + array = np.matrix + + +class TestNormDoubleMatrix(_TestNorm2DMatrix, _TestNormDoubleBase): + pass + + +class TestNormSingleMatrix(_TestNorm2DMatrix, _TestNormSingleBase): + pass + + +class TestNormInt64Matrix(_TestNorm2DMatrix, _TestNormInt64Base): + pass + + +class TestQRMatrix(_TestQR): + array = np.matrix diff --git a/numpy/matrixlib/tests/test_multiarray.py b/numpy/matrixlib/tests/test_multiarray.py index 6d84bd477..2f04b49d6 100644 --- a/numpy/matrixlib/tests/test_multiarray.py +++ b/numpy/matrixlib/tests/test_multiarray.py @@ -1,5 +1,13 @@ from __future__ import division, absolute_import, print_function +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + import numpy as np from numpy.testing import assert_, assert_equal, assert_array_equal diff --git a/numpy/matrixlib/tests/test_numeric.py b/numpy/matrixlib/tests/test_numeric.py index 95e1c8001..cfdada126 100644 --- a/numpy/matrixlib/tests/test_numeric.py +++ b/numpy/matrixlib/tests/test_numeric.py @@ -1,5 +1,13 @@ from __future__ import division, absolute_import, print_function +# As we are testing 
matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + import numpy as np from numpy.testing import assert_equal diff --git a/numpy/matrixlib/tests/test_regression.py b/numpy/matrixlib/tests/test_regression.py index 70e147279..439704ccf 100644 --- a/numpy/matrixlib/tests/test_regression.py +++ b/numpy/matrixlib/tests/test_regression.py @@ -1,5 +1,13 @@ from __future__ import division, absolute_import, print_function +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + import numpy as np from numpy.testing import assert_, assert_equal, assert_raises diff --git a/numpy/polynomial/tests/test_polyutils.py b/numpy/polynomial/tests/test_polyutils.py index 32ea55716..801c558cc 100644 --- a/numpy/polynomial/tests/test_polyutils.py +++ b/numpy/polynomial/tests/test_polyutils.py @@ -63,7 +63,7 @@ class TestDomain(object): dom1 = [0, 4] dom2 = [1, 3] tgt = dom2 - res = pu. mapdomain(dom1, dom1, dom2) + res = pu.mapdomain(dom1, dom1, dom2) assert_almost_equal(res, tgt) # test for complex values @@ -83,11 +83,14 @@ class TestDomain(object): assert_almost_equal(res, tgt) # test that subtypes are preserved. 
+ class MyNDArray(np.ndarray): + pass + dom1 = [0, 4] dom2 = [1, 3] - x = np.matrix([dom1, dom1]) + x = np.array([dom1, dom1]).view(MyNDArray) res = pu.mapdomain(x, dom1, dom2) - assert_(isinstance(res, np.matrix)) + assert_(isinstance(res, MyNDArray)) def test_mapparms(self): # test for real values diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx index 8ef153c15..b45b3146f 100644 --- a/numpy/random/mtrand/mtrand.pyx +++ b/numpy/random/mtrand/mtrand.pyx @@ -4901,10 +4901,24 @@ cdef class RandomState: """ if isinstance(x, (int, long, np.integer)): arr = np.arange(x) - else: - arr = np.array(x) - self.shuffle(arr) - return arr + self.shuffle(arr) + return arr + + arr = np.asarray(x) + + # shuffle has fast-path for 1-d + if arr.ndim == 1: + # must return a copy + if arr is x: + arr = np.array(arr) + self.shuffle(arr) + return arr + + # Shuffle index array, dtype to ensure fast path + idx = np.arange(arr.shape[0], dtype=np.intp) + self.shuffle(idx) + return arr[idx] + _rand = RandomState() seed = _rand.seed diff --git a/numpy/testing/_private/decorators.py b/numpy/testing/_private/decorators.py index 60d3f968f..24c4e385d 100644 --- a/numpy/testing/_private/decorators.py +++ b/numpy/testing/_private/decorators.py @@ -34,7 +34,7 @@ def slow(t): The exact definition of a slow test is obviously both subjective and hardware-dependent, but in general any individual test that requires more - than a second or two should be labeled as slow (the whole suite consits of + than a second or two should be labeled as slow (the whole suite consists of thousands of tests, so even a second is significant). 
Parameters diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py index b0c0b0c48..032c4a116 100644 --- a/numpy/testing/_private/utils.py +++ b/numpy/testing/_private/utils.py @@ -15,6 +15,7 @@ import shutil import contextlib from tempfile import mkdtemp, mkstemp from unittest.case import SkipTest +from warnings import WarningMessage import pprint from numpy.core import( @@ -685,7 +686,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='', precision=6, equal_nan=True, equal_inf=True): __tracebackhide__ = True # Hide traceback for py.test - from numpy.core import array, isnan, isinf, any, inf + from numpy.core import array, isnan, inf, bool_ x = array(x, copy=False, subok=True) y = array(y, copy=False, subok=True) @@ -695,17 +696,28 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, def istime(x): return x.dtype.char in "Mm" - def chk_same_position(x_id, y_id, hasval='nan'): - """Handling nan/inf: check that x and y have the nan/inf at the same - locations.""" - try: - assert_array_equal(x_id, y_id) - except AssertionError: + def func_assert_same_pos(x, y, func=isnan, hasval='nan'): + """Handling nan/inf: combine results of running func on x and y, + checking that they are True at the same locations.""" + # Both the != True comparison here and the cast to bool_ at + # the end are done to deal with `masked`, which cannot be + # compared usefully, and for which .all() yields masked. + x_id = func(x) + y_id = func(y) + if (x_id == y_id).all() != True: msg = build_err_msg([x, y], err_msg + '\nx and y %s location mismatch:' % (hasval), verbose=verbose, header=header, names=('x', 'y'), precision=precision) raise AssertionError(msg) + # If there is a scalar, then here we know the array has the same + # flag as it everywhere, so we should return the scalar flag. 
+ if x_id.ndim == 0: + return bool_(x_id) + elif y_id.ndim == 0: + return bool_(y_id) + else: + return y_id try: cond = (x.shape == () or y.shape == ()) or x.shape == y.shape @@ -718,49 +730,32 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, names=('x', 'y'), precision=precision) raise AssertionError(msg) + flagged = bool_(False) if isnumber(x) and isnumber(y): - has_nan = has_inf = False if equal_nan: - x_isnan, y_isnan = isnan(x), isnan(y) - # Validate that NaNs are in the same place - has_nan = any(x_isnan) or any(y_isnan) - if has_nan: - chk_same_position(x_isnan, y_isnan, hasval='nan') + flagged = func_assert_same_pos(x, y, func=isnan, hasval='nan') if equal_inf: - x_isinf, y_isinf = isinf(x), isinf(y) - # Validate that infinite values are in the same place - has_inf = any(x_isinf) or any(y_isinf) - if has_inf: - # Check +inf and -inf separately, since they are different - chk_same_position(x == +inf, y == +inf, hasval='+inf') - chk_same_position(x == -inf, y == -inf, hasval='-inf') - - if has_nan and has_inf: - x = x[~(x_isnan | x_isinf)] - y = y[~(y_isnan | y_isinf)] - elif has_nan: - x = x[~x_isnan] - y = y[~y_isnan] - elif has_inf: - x = x[~x_isinf] - y = y[~y_isinf] - - # Only do the comparison if actual values are left - if x.size == 0: - return + flagged |= func_assert_same_pos(x, y, + func=lambda xy: xy == +inf, + hasval='+inf') + flagged |= func_assert_same_pos(x, y, + func=lambda xy: xy == -inf, + hasval='-inf') elif istime(x) and istime(y): # If one is datetime64 and the other timedelta64 there is no point if equal_nan and x.dtype.type == y.dtype.type: - x_isnat, y_isnat = isnat(x), isnat(y) + flagged = func_assert_same_pos(x, y, func=isnat, hasval="NaT") - if any(x_isnat) or any(y_isnat): - chk_same_position(x_isnat, y_isnat, hasval="NaT") - - if any(x_isnat) or any(y_isnat): - x = x[~x_isnat] - y = y[~y_isnat] + if flagged.ndim > 0: + x, y = x[~flagged], y[~flagged] + # Only do the comparison if actual values are left + if 
x.size == 0: + return + elif flagged: + # no sense doing comparison if everything is flagged. + return val = comparison(x, y) @@ -771,7 +766,11 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, reduced = val.ravel() cond = reduced.all() reduced = reduced.tolist() - if not cond: + # The below comparison is a hack to ensure that fully masked + # results, for which val.ravel().all() returns np.ma.masked, + # do not trigger a failure (np.ma.masked != True evaluates as + # np.ma.masked, which is falsy). + if cond != True: match = 100-100.0*reduced.count(1)/len(reduced) msg = build_err_msg([x, y], err_msg @@ -1369,16 +1368,20 @@ def _assert_valid_refcount(op): """ if not HAS_REFCOUNT: return True - import numpy as np + import numpy as np, gc b = np.arange(100*100).reshape(100, 100) c = b i = 1 - rc = sys.getrefcount(i) - for j in range(15): - d = op(b, c) - assert_(sys.getrefcount(i) >= rc) + gc.disable() + try: + rc = sys.getrefcount(i) + for j in range(15): + d = op(b, c) + assert_(sys.getrefcount(i) >= rc) + finally: + gc.enable() del d # for pyflakes @@ -1631,98 +1634,6 @@ def integer_repr(x): raise ValueError("Unsupported dtype %s" % x.dtype) -# The following two classes are copied from python 2.6 warnings module (context -# manager) -class WarningMessage(object): - - """ - Holds the result of a single showwarning() call. - - Deprecated in 1.8.0 - - Notes - ----- - `WarningMessage` is copied from the Python 2.6 warnings module, - so it can be used in NumPy with older Python versions. 
- - """ - - _WARNING_DETAILS = ("message", "category", "filename", "lineno", "file", - "line") - - def __init__(self, message, category, filename, lineno, file=None, - line=None): - local_values = locals() - for attr in self._WARNING_DETAILS: - setattr(self, attr, local_values[attr]) - if category: - self._category_name = category.__name__ - else: - self._category_name = None - - def __str__(self): - return ("{message : %r, category : %r, filename : %r, lineno : %s, " - "line : %r}" % (self.message, self._category_name, - self.filename, self.lineno, self.line)) - - -class WarningManager(object): - """ - A context manager that copies and restores the warnings filter upon - exiting the context. - - The 'record' argument specifies whether warnings should be captured by a - custom implementation of ``warnings.showwarning()`` and be appended to a - list returned by the context manager. Otherwise None is returned by the - context manager. The objects appended to the list are arguments whose - attributes mirror the arguments to ``showwarning()``. - - The 'module' argument is to specify an alternative module to the module - named 'warnings' and imported under that name. This argument is only useful - when testing the warnings module itself. - - Deprecated in 1.8.0 - - Notes - ----- - `WarningManager` is a copy of the ``catch_warnings`` context manager - from the Python 2.6 warnings module, with slight modifications. - It is copied so it can be used in NumPy with older Python versions. 
- - """ - - def __init__(self, record=False, module=None): - self._record = record - if module is None: - self._module = sys.modules['warnings'] - else: - self._module = module - self._entered = False - - def __enter__(self): - if self._entered: - raise RuntimeError("Cannot enter %r twice" % self) - self._entered = True - self._filters = self._module.filters - self._module.filters = self._filters[:] - self._showwarning = self._module.showwarning - if self._record: - log = [] - - def showwarning(*args, **kwargs): - log.append(WarningMessage(*args, **kwargs)) - self._module.showwarning = showwarning - return log - else: - return None - - def __exit__(self): - if not self._entered: - raise RuntimeError("Cannot exit %r without entering first" % self) - self._module.filters = self._filters - self._module.showwarning = self._showwarning - - @contextlib.contextmanager def _assert_warns_context(warning_class, name=None): __tracebackhide__ = True # Hide traceback for py.test diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py index 0592e62f8..465c217d4 100644 --- a/numpy/testing/tests/test_utils.py +++ b/numpy/testing/tests/test_utils.py @@ -151,6 +151,17 @@ class TestArrayEqual(_GenericTest): self._test_not_equal(c, b) assert_equal(len(l), 1) + def test_masked_nan_inf(self): + # Regression test for gh-11121 + a = np.ma.MaskedArray([3., 4., 6.5], mask=[False, True, False]) + b = np.array([3., np.nan, 6.5]) + self._test_equal(a, b) + self._test_equal(b, a) + a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, False, False]) + b = np.array([np.inf, 4., 6.5]) + self._test_equal(a, b) + self._test_equal(b, a) + class TestBuildErrorMessage(object): @@ -286,7 +297,7 @@ class TestEqual(TestArrayEqual): def test_error_message(self): try: - self._assert_func(np.array([1, 2]), np.matrix([1, 2])) + self._assert_func(np.array([1, 2]), np.array([[1, 2]])) except AssertionError as e: msg = str(e) msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 
2)") @@ -296,7 +307,7 @@ class TestEqual(TestArrayEqual): (shapes (2,), (1, 2) mismatch) x: array([1, 2]) - y: matrix([[1, 2]])""") + y: array([[1, 2]])""") try: assert_equal(msg, msg_reference) except AssertionError: @@ -366,19 +377,23 @@ class TestArrayAlmostEqual(_GenericTest): self._assert_func(b, a) self._assert_func(b, b) - def test_matrix(self): - # Matrix slicing keeps things 2-D, while array does not necessarily. - # See gh-8452. - m1 = np.matrix([[1., 2.]]) - m2 = np.matrix([[1., np.nan]]) - m3 = np.matrix([[1., -np.inf]]) - m4 = np.matrix([[np.nan, np.inf]]) - m5 = np.matrix([[1., 2.], [np.nan, np.inf]]) - for m in m1, m2, m3, m4, m5: - self._assert_func(m, m) - a = np.array(m) - self._assert_func(a, m) - self._assert_func(m, a) + # Test fully masked as well (see gh-11123). + a = np.ma.MaskedArray(3.5, mask=True) + b = np.array([3., 4., 6.5]) + self._test_equal(a, b) + self._test_equal(b, a) + a = np.ma.masked + b = np.array([3., 4., 6.5]) + self._test_equal(a, b) + self._test_equal(b, a) + a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True]) + b = np.array([1., 2., 3.]) + self._test_equal(a, b) + self._test_equal(b, a) + a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True]) + b = np.array(1.) + self._test_equal(a, b) + self._test_equal(b, a) def test_subclass_that_cannot_be_bool(self): # While we cannot guarantee testing functions will always work for @@ -386,6 +401,9 @@ class TestArrayAlmostEqual(_GenericTest): # comparison operators, not on them being able to store booleans # (which, e.g., astropy Quantity cannot usefully do). See gh-8452. 
class MyArray(np.ndarray): + def __eq__(self, other): + return super(MyArray, self).__eq__(other).view(np.ndarray) + def __lt__(self, other): return super(MyArray, self).__lt__(other).view(np.ndarray) @@ -479,26 +497,15 @@ class TestAlmostEqual(_GenericTest): # remove anything that's not the array string assert_equal(str(e).split('%)\n ')[1], b) - def test_matrix(self): - # Matrix slicing keeps things 2-D, while array does not necessarily. - # See gh-8452. - m1 = np.matrix([[1., 2.]]) - m2 = np.matrix([[1., np.nan]]) - m3 = np.matrix([[1., -np.inf]]) - m4 = np.matrix([[np.nan, np.inf]]) - m5 = np.matrix([[1., 2.], [np.nan, np.inf]]) - for m in m1, m2, m3, m4, m5: - self._assert_func(m, m) - a = np.array(m) - self._assert_func(a, m) - self._assert_func(m, a) - def test_subclass_that_cannot_be_bool(self): # While we cannot guarantee testing functions will always work for # subclasses, the tests should ideally rely only on subclasses having # comparison operators, not on them being able to store booleans # (which, e.g., astropy Quantity cannot usefully do). See gh-8452. 
class MyArray(np.ndarray): + def __eq__(self, other): + return super(MyArray, self).__eq__(other).view(np.ndarray) + def __lt__(self, other): return super(MyArray, self).__lt__(other).view(np.ndarray) @@ -660,6 +667,7 @@ class TestArrayAssertLess(object): assert_raises(AssertionError, lambda: self._assert_func(-ainf, -x)) self._assert_func(-ainf, x) + @pytest.mark.skip(reason="The raises decorator depends on Nose") class TestRaises(object): diff --git a/numpy/tests/test_matlib.py b/numpy/tests/test_matlib.py index 12116b883..38a7e39df 100644 --- a/numpy/tests/test_matlib.py +++ b/numpy/tests/test_matlib.py @@ -1,5 +1,13 @@ from __future__ import division, absolute_import, print_function +# As we are testing matrices, we ignore its PendingDeprecationWarnings +try: + import pytest + pytestmark = pytest.mark.filterwarnings( + 'ignore:the matrix subclass is not:PendingDeprecationWarning') +except ImportError: + pass + import numpy as np import numpy.matlib from numpy.testing import assert_array_equal, assert_ diff --git a/pavement.py b/pavement.py index 0065c142b..3484e8029 100644 --- a/pavement.py +++ b/pavement.py @@ -571,14 +571,14 @@ def compute_md5(idirs): return _compute_hash(idirs, hashlib.md5) def compute_sha256(idirs): - # better checksum so gpg signed README.txt containing the sums can be used + # better checksum so gpg signed README.rst containing the sums can be used # to verify the binaries instead of signing all binaries return _compute_hash(idirs, hashlib.sha256) def write_release_task(options, filename='README'): idirs = options.installers.installersdir source = paver.path.path(RELEASE_NOTES) - target = paver.path.path(filename) + target = paver.path.path(filename + '.rst') if target.exists(): target.remove() diff --git a/runtests.py b/runtests.py index 12e3f2886..35717b319 100755 --- a/runtests.py +++ b/runtests.py @@ -311,6 +311,8 @@ def build_project(args): """ + import distutils.sysconfig + root_ok = [os.path.exists(os.path.join(ROOT_DIR, fn)) for 
fn in PROJECT_ROOT_FILES] if not all(root_ok): @@ -325,14 +327,25 @@ def build_project(args): # Always use ccache, if installed env['PATH'] = os.pathsep.join(EXTRA_PATH + env.get('PATH', '').split(os.pathsep)) - + cvars = distutils.sysconfig.get_config_vars() + if 'gcc' in cvars.get('CC', ''): + # add flags used as werrors + warnings_as_errors = ' '.join([ + # from tools/travis-test.sh + '-Werror=declaration-after-statement', + '-Werror=vla', + '-Werror=nonnull', + '-Werror=pointer-arith', + '-Wlogical-op', + # from sysconfig + '-Werror=unused-function', + ]) + env['CFLAGS'] = warnings_as_errors + ' ' + env.get('CFLAGS', '') if args.debug or args.gcov: # assume everyone uses gcc/gfortran env['OPT'] = '-O0 -ggdb' env['FOPT'] = '-O0 -ggdb' if args.gcov: - import distutils.sysconfig - cvars = distutils.sysconfig.get_config_vars() env['OPT'] = '-O0 -ggdb' env['FOPT'] = '-O0 -ggdb' env['CC'] = cvars['CC'] + ' --coverage' diff --git a/site.cfg.example b/site.cfg.example index 645b48543..21609a332 100644 --- a/site.cfg.example +++ b/site.cfg.example @@ -180,6 +180,14 @@ # mkl_libs = mkl_rt # lapack_libs = +# ACCELERATE +# ---------- +# Accelerate/vecLib is an OSX framework providing a BLAS and LAPACK implementations. +# +# [accelerate] +# libraries = Accelerate, vecLib +# #libraries = None + # UMFPACK # ------- # The UMFPACK library is used in scikits.umfpack to factor large sparse matrices. |