81 files changed, 1441 insertions, 376 deletions
diff --git a/README.md b/README.md
index ae92b6309..276261a9f 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 NumPy is the fundamental package needed for scientific computing with Python.
 
 - **Website:** https://www.numpy.org
-- **Documentation:** https://docs.scipy.org/
+- **Documentation:** https://numpy.org/doc
 - **Mailing list:** https://mail.python.org/mailman/listinfo/numpy-discussion
 - **Source code:** https://github.com/numpy/numpy
 - **Contributing:** https://www.numpy.org/devdocs/dev/index.html
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 7b23fef61..564f5d8e8 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -40,7 +40,6 @@ stages:
             /bin/bash -xc "cd numpy && \
             /opt/python/cp38-cp38/bin/python -mvenv venv &&\
             source venv/bin/activate && \
-            python3 -m pip install urllib3 && \
             target=\$(python3 tools/openblas_support.py) && \
             cp -r \$target/lib/* /usr/lib && \
             cp \$target/include/* /usr/include && \
@@ -122,7 +121,6 @@ stages:
     # primarily on file size / name details
     - script: |
         set -xe
-        python -m pip install urllib3
         target=$(python tools/openblas_support.py)
         ls -lR $target
         # manually link to appropriate system paths
diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml
index eddc6a9b9..c39faddf9 100644
--- a/azure-steps-windows.yml
+++ b/azure-steps-windows.yml
@@ -9,13 +9,21 @@ steps:
 - script: python -m pip install -r test_requirements.txt
   displayName: 'Install dependencies; some are optional to avoid test skips'
 - powershell: |
-    $pyversion = python -c "import sys; print(sys.version.split()[0])"
-    Write-Host "Python Version: $pyversion"
-    $target = "C:\\hostedtoolcache\\windows\\Python\\$pyversion\\$(PYTHON_ARCH)\\lib\\openblas$env:OPENBLAS_SUFFIX.a"
-    Write-Host "target path: $target"
-    python -mpip install urllib3
-    $openblas = python tools/openblas_support.py
-    cp $openblas $target
+    $ErrorActionPreference = "Stop"
+    # Download and get the path to "openblas.a". We cannot copy it
+    # to $PYTHON_EXE's directory since that is on a different drive which
+    # mingw does not like. Instead copy it to a directory and set OPENBLAS,
+    # since OPENBLAS will be picked up by the openblas discovery
+    python -m pip  install
+    $target = $(python tools/openblas_support.py)
+    mkdir openblas
+    echo Copying $target to openblas/openblas$env:OPENBLAS_SUFFIX.a
+    cp $target openblas/openblas$env:OPENBLAS_SUFFIX.a
+    If ( Test-Path env:NPY_USE_BLAS_ILP64 ){
+        echo "##vso[task.setvariable variable=OPENBLAS64_]$pwd\openblas"
+    } else {
+        echo "##vso[task.setvariable variable=OPENBLAS]$pwd\openblas"
+    }
   displayName: 'Download / Install OpenBLAS'
 
 - powershell: |
@@ -32,7 +40,7 @@ steps:
         refreshenv
     }
     python -c "from tools import openblas_support; openblas_support.make_init('numpy')"
-    pip wheel -v -v -v --wheel-dir=dist .
+    pip wheel -v -v -v --no-build-isolation --no-use-pep517 --wheel-dir=dist .
 
     ls dist -r | Foreach-Object {
         pip install $_.FullName
@@ -40,7 +48,7 @@ steps:
   displayName: 'Build NumPy'
 - bash: |
     pushd . && cd .. && target=$(python -c "import numpy, os; print(os.path.abspath(os.path.join(os.path.dirname(numpy.__file__), '.libs')))") && popd
-    pip download -d destination --only-binary --no-deps numpy==1.14
+    pip download -d destination --only-binary :all: --no-deps numpy==1.14
     cd destination && unzip numpy*.whl && cp numpy/.libs/*.dll $target
     ls $target
   displayName: 'Add extraneous & older DLL to numpy/.libs to probe DLL handling robustness'
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index 1046f10f2..d9222d495 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -15,7 +15,7 @@
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "tip" (for mercurial).
-    "branches": ["master"],
+    "branches": ["HEAD"],
 
     // The DVCS being used.  If not set, it will be automatically
     // determined from "repo" by looking at the protocol in the URL
diff --git a/doc/TESTS.rst.txt b/doc/TESTS.rst.txt
index 007840b39..af47fe99c 100644
--- a/doc/TESTS.rst.txt
+++ b/doc/TESTS.rst.txt
@@ -256,7 +256,7 @@ section of your setup.py::
   ...
   def configuration(parent_package='', top_path=None):
       ...
-      config.add_data_dir('tests')
+      config.add_subpackage('tests')
       return config
   ...
 
diff --git a/doc/changelog/1.18.4-changelog.rst b/doc/changelog/1.18.4-changelog.rst
new file mode 100644
index 000000000..f3524b5f5
--- /dev/null
+++ b/doc/changelog/1.18.4-changelog.rst
@@ -0,0 +1,23 @@
+
+Contributors
+============
+
+A total of 4 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Charles Harris
+* Matti Picus
+* Sebastian Berg
+* Warren Weckesser
+
+Pull requests merged
+====================
+
+A total of 6 pull requests were merged for this release.
+
+* `#16055 <https://github.com/numpy/numpy/pull/16055>`__: BLD: add i686 for 1.18 builds
+* `#16090 <https://github.com/numpy/numpy/pull/16090>`__: BUG: random: ``Generator.integers(2**32)`` always returned 0.
+* `#16091 <https://github.com/numpy/numpy/pull/16091>`__: BLD: fix path to libgfortran on macOS
+* `#16109 <https://github.com/numpy/numpy/pull/16109>`__: REV: Reverts side-effect changes to casting
+* `#16114 <https://github.com/numpy/numpy/pull/16114>`__: BLD: put openblas library in local directory on windows
+* `#16132 <https://github.com/numpy/numpy/pull/16132>`__: DOC: Change import error "howto" to link to new troubleshooting...
diff --git a/doc/neps/nep-0040-legacy-datatype-impl.rst b/doc/neps/nep-0040-legacy-datatype-impl.rst
index c9a4fd7c8..c247e3d62 100644
--- a/doc/neps/nep-0040-legacy-datatype-impl.rst
+++ b/doc/neps/nep-0040-legacy-datatype-impl.rst
@@ -16,7 +16,8 @@ NEP 40 — Legacy Datatype Implementation in NumPy
     This NEP is part of a series of NEPs encompassing first information
     about the previous dtype implementation and issues with it in NEP 40
     (this document).
-    `NEP 41 <NEP41>`_ then provides an overview and generic design choices for the refactor.
+    :ref:`NEP 41 <NEP41>` then provides an overview and generic design choices
+    for the refactor.
     Further NEPs 42 and 43 go into the technical details of the datatype
     and universal function related internal and external API changes.
     In some cases it may be necessary to consult the other NEPs for a full
@@ -31,7 +32,7 @@ As a preparation to further NumPy enhancement proposals 41, 42, and 43. This
 NEP details the current status of NumPy datatypes as of NumPy 1.18.
 It describes some of the technical aspects and concepts that
 motivated the other proposals.
-For more general information most readers should begin by reading `NEP 41 <NEP41>`_
+For more general information most readers should begin by reading :ref:`NEP 41 <NEP41>`
 and use this document only as a reference or for additional details.
 
 
diff --git a/doc/neps/nep-0041-improved-dtype-support.rst b/doc/neps/nep-0041-improved-dtype-support.rst
index 6310741e0..294f52746 100644
--- a/doc/neps/nep-0041-improved-dtype-support.rst
+++ b/doc/neps/nep-0041-improved-dtype-support.rst
@@ -8,16 +8,16 @@ NEP 41 — First step towards a new Datatype System
 :Author: Sebastian Berg
 :Author: Stéfan van der Walt
 :Author: Matti Picus
-:Status: Draft
+:Status: Accepted
 :Type: Standard Track
 :Created: 2020-02-03
-
+:Resolution: https://mail.python.org/pipermail/numpy-discussion/2020-April/080573.html and https://mail.python.org/pipermail/numpy-discussion/2020-March/080495.html
 
 .. note::
 
     This NEP is part of a series of NEPs encompassing first information
     about the previous dtype implementation and issues with it in
-    `NEP 40 <NEP40>`_.
+    :ref:`NEP 40 <NEP40>`.
     NEP 41 (this document) then provides an overview and generic design
     choices for the refactor.
     Further NEPs 42 and 43 go into the technical details of the datatype
@@ -29,7 +29,7 @@ NEP 41 — First step towards a new Datatype System
 Abstract
 --------
 
-`Datatypes <data-type-objects-dtype>` in NumPy describe how to interpret each
+:ref:`Datatypes <arrays.dtypes>` in NumPy describe how to interpret each
 element in arrays. NumPy provides ``int``, ``float``, and ``complex`` numerical
 types, as well as string, datetime, and structured datatype capabilities.
 The growing Python community, however, has need for more diverse datatypes.
@@ -61,7 +61,8 @@ Motivation
 
 One of the main issues with the current API is the definition of typical
 functions such as addition and multiplication for parametric datatypes
-(see also `NEP 40 <NEP40>`_) which require additional steps to determine the output type.
+(see also :ref:`NEP 40 <NEP40>`)
+which require additional steps to determine the output type.
 For example when adding two strings of length 4, the result is a string
 of length 8, which is different from the input.
 Similarly, a datatype which embeds a physical unit must calculate the new unit
@@ -144,7 +145,8 @@ in more details in the detailed description section:
    Storage information such as itemsize and byteorder can differ between
    different dtype instances (e.g. "S3" vs. "S8") and will remain part of the instance.
    This means that in the long run the current lowlevel access to dtype methods
-   will be removed (see ``PyArray_ArrFuncs`` in `NEP 40 <NEP40>`_).
+   will be removed (see ``PyArray_ArrFuncs`` in
+   :ref:`NEP 40 <NEP40>`).
 
 2. The current NumPy scalars will *not* change, they will not be instances of
    datatypes. This will also be true for new datatypes, scalars will not be
@@ -225,7 +227,7 @@ Simple Numerical Types
 """"""""""""""""""""""
 
 Mainly used where memory is a consideration, lower-precision numeric types
-such as :ref:```bfloat16`` <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`
+such as `bfloat16 <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_
 are common in other computational frameworks.
 For these types the definitions of things such as ``np.common_type`` and
 ``np.can_cast`` are some of the most important interfaces. Once they
@@ -583,7 +585,8 @@ Organizing these methods and information in a more Pythonic way provides a
 solid foundation for refining and extending the API in the future.
 The current API cannot be extended due to how it is exposed publically.
 This means for example that the methods currently stored in ``PyArray_ArrFuncs``
-on each datatype (see `NEP 40 <NEP40>`_) will be defined differently in the future and
+on each datatype (see :ref:`NEP 40 <NEP40>`)
+will be defined differently in the future and
 deprecated in the long run.
 
 The most prominent visible side effect of this will be that
@@ -601,12 +604,22 @@ Inheritance, however, appears problematic and a complexity best avoided
 Further, subclasses may be more interesting for interoperability for
 example with GPU backends (CuPy) storing additional methods related to the
 GPU rather than as a mechanism to define new datatypes.
-A class hierarchy does provides value, this may be achieved by
+A class hierarchy does provide value, and one can be achieved by
 allowing the creation of *abstract* datatypes.
 An example for an abstract datatype would be the datatype equivalent of
 ``np.floating``, representing any floating point number.
 These can serve the same purpose as Python's abstract base classes.
 
+This NEP chooses to duplicate the scalar hierarchy fully or in part.
+The main reason is to uncouple the implementation of the DType and scalar.
+To add a DType to NumPy, in theory the scalar will not need to be
+modified or know about NumPy. Also note that the categorical DType as
+currently implemented in pandas does not have a scalar correspondence
+making it less straight forward to rely on scalars to implement behaviour.
+While DType and Scalar describe the same concept/type (e.g. an `int64`),
+it seems practical to split out the information and functionality necessary
+for numpy into the DType class.
+
 
 Scalars should not be instances of the datatypes (2)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -684,8 +697,9 @@ C-API Changes to the UFunc Machinery (4)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Proposed changes to the UFunc machinery will be part of NEP 43.
-However, the following changes will be necessary (see `NEP 40 <NEP40>`_ for a detailed
-description of the current implementation and its issues):
+However, the following changes will be necessary
+(see :ref:`NEP 40 <NEP40>`
+for a detailed description of the current implementation and its issues):
 
 * The current UFunc type resolution must be adapted to allow better control
   for user-defined dtypes as well as resolve current inconsistencies.
@@ -704,7 +718,15 @@ functions after handling the unit related computations.
 Discussion
 ----------
 
-See `NEP 40 <NEP40>`_ for a list of previous meetings and discussions.
+See :ref:`NEP 40 <NEP40>`
+for a list of previous meetings and discussions.
+
+Additional discussion around this specific NEP has occurred on both
+the mailing list and the pull request:
+
+* `Mailing list discussion <https://mail.python.org/pipermail/numpy-discussion/2020-March/080481.html>`_
+* `NEP 41 pull request <https://github.com/numpy/numpy/pull/15506>`_
+* `Pull request thread on Dtype hierarchy and Scalars <https://github.com/numpy/numpy/pull/15506#discussion_r390016298>`_
 
 
 References
diff --git a/doc/release/13421.improvement.rst b/doc/release/upcoming_changes/13421.improvement.rst
index 6d4573aa3..6d4573aa3 100644
--- a/doc/release/13421.improvement.rst
+++ b/doc/release/upcoming_changes/13421.improvement.rst
diff --git a/doc/release/upcoming_changes/14924.compatibility.rst b/doc/release/upcoming_changes/14924.compatibility.rst
new file mode 100644
index 000000000..8b42437fd
--- /dev/null
+++ b/doc/release/upcoming_changes/14924.compatibility.rst
@@ -0,0 +1,7 @@
+Changed random variate stream from `numpy.random.Generator.dirichlet`
+---------------------------------------------------------------------
+A bug in the generation of random variates for the Dirichlet distribution
+with small `alpha` values was fixed by using a different algorithm when
+``max(alpha) < 0.1``.  Because of the change, the stream of variates
+generated by `dirichlet` in this case will be different from previous
+releases.
diff --git a/doc/release/upcoming_changes/15715.new_feature.rst b/doc/release/upcoming_changes/15715.new_feature.rst
new file mode 100644
index 000000000..a568aee64
--- /dev/null
+++ b/doc/release/upcoming_changes/15715.new_feature.rst
@@ -0,0 +1,5 @@
+`numpy.linalg.multi_dot` now accepts an ``out`` argument
+--------------------------------------------------------
+
+``out`` can be used to avoid creating unnecessary copies of the final product
+computed by `numpy.linalg.multi_dot`.
diff --git a/doc/release/upcoming_changes/15769.improvement.rst b/doc/release/upcoming_changes/15769.improvement.rst
new file mode 100644
index 000000000..3f70058f6
--- /dev/null
+++ b/doc/release/upcoming_changes/15769.improvement.rst
@@ -0,0 +1,15 @@
+Ability to disable madvise hugepages
+------------------------------------
+
+On Linux NumPy has previously added support for madvise
+hugepages which can improve performance for very large arrays.
+Unfortunately, on older Kernel versions this led to performance
+regressions, thus by default the support has been disabled on
+kernels before version 4.6. To override the default, you can
+use the environment variable::
+
+    NUMPY_MADVISE_HUGEPAGE=0
+
+or set it to 1 to force enabling support. Note that this only makes
+a difference if the operating system is set up to use madvise
+transparent hugepage.
diff --git a/doc/release/upcoming_changes/16068.compatibility.rst b/doc/release/upcoming_changes/16068.compatibility.rst
new file mode 100644
index 000000000..6fe693386
--- /dev/null
+++ b/doc/release/upcoming_changes/16068.compatibility.rst
@@ -0,0 +1,11 @@
+C-Level string to datetime casts changed
+----------------------------------------
+The C-level casts from strings were simplified. This change
+also fixes string to datetime and timedelta casts to behave
+correctly (i.e. like Python casts using ``string_arr.astype("M8")``
+while previously the cast would behave like
+``string_arr.astype(np.int_).astype("M8")``).
+This only affects code using low-level C-API to do manual casts
+(not full array casts) of single scalar values or using e.g.
+``PyArray_GetCastFunc``, and should thus not affect the vast majority
+of users.
diff --git a/doc/release/upcoming_changes/16128.new_feature.rst b/doc/release/upcoming_changes/16128.new_feature.rst
new file mode 100644
index 000000000..03b061c07
--- /dev/null
+++ b/doc/release/upcoming_changes/16128.new_feature.rst
@@ -0,0 +1,6 @@
+``equal_nan`` parameter for `numpy.array_equal`
+------------------------------------------------
+The keyword argument ``equal_nan`` was added to `numpy.array_equal`.
+``equal_nan`` is a boolean value that toggles whether or not ``nan`` values
+are considered equal in comparison (default is ``False``). This matches the API
+used in related functions such as `numpy.isclose` and `numpy.allclose`.
diff --git a/doc/source/contents.rst b/doc/source/contents.rst
index c89153aa7..baea7784c 100644
--- a/doc/source/contents.rst
+++ b/doc/source/contents.rst
@@ -1,8 +1,8 @@
-.. _manual:
+.. _numpy_docs_mainpage:
 
-#####################
-NumPy manual contents
-#####################
+###################
+NumPy Documentation
+###################
 
 .. toctree::
 
diff --git a/doc/source/dev/conduct/code_of_conduct.rst b/doc/source/dev/conduct/code_of_conduct.rst
index aca39d8a7..f2f0a536d 100644
--- a/doc/source/dev/conduct/code_of_conduct.rst
+++ b/doc/source/dev/conduct/code_of_conduct.rst
@@ -113,8 +113,8 @@ You can report issues to the NumPy Code of Conduct committee, at
 numpy-conduct@googlegroups.com. Currently, the committee consists of:
 
 - Stefan van der Walt
-- Nathaniel J. Smith
-- Ralf Gommers
+- Melissa Weber Mendonça
+- Anirudh Subramanian
 
 If your report involves any members of the committee, or if they feel they have
 a conflict of interest in handling it, then they will recuse themselves from
diff --git a/doc/source/dev/governance/people.rst b/doc/source/dev/governance/people.rst
index 10af7f221..db2704d09 100644
--- a/doc/source/dev/governance/people.rst
+++ b/doc/source/dev/governance/people.rst
@@ -56,7 +56,7 @@ NumFOCUS Subcommittee
 Institutional Partners
 ----------------------
 
-* UC Berkeley (Stefan van der Walt, Matti Picus, Tyler Reddy, Sebastian Berg)
+* UC Berkeley (Stefan van der Walt, Sebastian Berg, Warren Weckesser, Ross Barnowski)
 
-* Quansight (Ralf Gommers, Hameer Abbasi)
+* Quansight (Ralf Gommers, Hameer Abbasi, Melissa Weber Mendonça, Mars Lee, Matti Picus)
 
diff --git a/doc/source/dev/howto-docs.rst b/doc/source/dev/howto-docs.rst
new file mode 100644
index 000000000..0e2e03e0b
--- /dev/null
+++ b/doc/source/dev/howto-docs.rst
@@ -0,0 +1,149 @@
+.. _howto-docs:
+
+############################################
+How to contribute to the NumPy documentation
+############################################
+
+The *Documentation* for a software project is the set of reference,
+instructional, educational, informative material generated by the project
+developers and contributors, as well as discussions, presentations, videos and
+other user-generated content. It may include learning-oriented content (such as
+tutorials and how-tos), use-cases or in-depth explanations and reference for
+developers.
+
+If you're reading this page, you probably want to help. This guide is meant to
+help you decide which kind of content you'll write, as well as give you some
+tips and instructions for submitting it to the official NumPy documentation
+(that is, the documentation that ships with NumPy and lives on the
+:ref:`official project pages <numpy_docs_mainpage>`). Keep in mind that if you
+don't want to do this, writing a tutorial on your own blog, creating a YouTube
+video or answering questions on social media or Stack Overflow are also great
+contributions!
+
+NumPy has a Documentation Team. We have open meetings on Zoom every three weeks
+and invite everyone to join. Don't hesitate to reach out if you have questions
+or just need someone to guide you through your first steps - we're always happy
+to help. Meetings are usually announced on the `numpy-discussion mailing list
+<https://mail.python.org/mailman/listinfo/numpy-discussion>`__. Meeting minutes
+are taken `on hackmd.io <https://hackmd.io/oB_boakvRqKR-_2jRV-Qjg>`__ and stored
+in the `NumPy Archive repository <https://github.com/numpy/archive>`__.
+
+You can find larger planned and in-progress documentation improvement ideas `at
+our GitHub project <https://github.com/orgs/numpy/projects/2>`__.
+
+Current vision for the documentation: NEP 44
+--------------------------------------------
+
+Recently, the NumPy community approved a *NumPy Enhancement Proposal (NEP)*
+about documentation, `NEP 44 - Restructuring the NumPy Documentation
+<https://www.numpy.org/neps/nep-0044-restructuring-numpy-docs>`__.
+
+**Where is the documentation?**
+
+The main page for the :ref:`NumPy Documentation <numpy_docs_mainpage>` lists
+several categories. The documents mentioned there live in different places.
+
+- **Tutorials, How-Tos, Explanations:** These documents are stored in the NumPy
+  source code tree, which means that in order to add them to the official
+  documentation, you have to download the NumPy source code,
+  :ref:`build it <howto-build-docs>` and submit your changes via a
+  :ref:`GitHub pull request <devindex>`.
+
+- **API Reference:** These are mainly the result of rendering the NumPy code
+  `docstrings <https://www.python.org/dev/peps/pep-0257/>`__ into formatted
+  documents. They are automatically generated when the NumPy documentation is
+  :ref:`built from source<howto-build-docs>`.
+
+**Datasets**
+
+If you are writing a tutorial or how-to, we encourage you to use real images and
+data (provided they are appropriately licensed and available). This makes the
+material more engaging for readers, and choosing the right data can add
+pedagogical value to your content.
+
+*Note: currently we cannot easily use data in other packages (except, e.g., from
+SciPy or Matplotlib). We plan to create a dedicated datasets package, but that's
+not ready yet - please discuss with us if you have data sources in mind.*
+
+Creating new content
+--------------------
+
+The documentation is written in reStructuredText, which is the format required
+by Sphinx, the tool most Python projects use to automatically build and link the
+documentation within the project. You can read the
+`Quick reStructuredText Guide
+<https://docutils.sourceforge.io/docs/user/rst/quickref.html>`__ or the
+`reStructuredText Primer
+<http://www.sphinx-doc.org/en/stable/usage/restructuredtext/basics.html>`__ for
+more information.
+
+If you have already decided which type of document you want to write, you can
+check out the following specific guides:
+
+- Guide to writing Tutorials (TODO)
+- :ref:`Guide to writing reference (API) documentation: the numpydoc docstring
+  guide <howto-document>`
+
+Major additions to the documentation (e.g. new tutorials) should be proposed to
+the `mailing list
+<https://mail.python.org/mailman/listinfo/numpy-discussion>`__.
+  
+Other ways to contribute
+------------------------
+
+Correcting technical inaccuracies in the documentation is high priority. For
+example, if a docstring is missing a parameter or the description of a
+function/parameter/method etc. is incorrect. Other "structural" defects like
+broken links are also high priority.
+
+Proposals for changes that improve the clarity of the documentation are welcome.
+However, "clarity" is a bit subjective, so such proposals are best made by
+raising issues that describe what could be improved in the current
+documentation. Proposals that include specific suggestions for the improvement
+are encouraged as the proposed changes help frame the discussion.
+
+Based on the above characterization, "high-priority" changes (i.e. fixing
+technical inaccuracies, broken links, etc.) can be proposed via pull requests
+directly as they are straightforward to review. Other changes should be raised
+as issues first so that the discussion can happen before you make major
+modifications, which in principle saves you from wasting your time on
+undesired changes.
+
+If you see a good tutorial, how-to or explanation that is not included in the
+official documentation, you can suggest it to be added by `opening an issue on
+GitHub <https://github.com/numpy/numpy/issues>`__. Similarly, opening issues to
+suggest a tutorial, how-to or explanation that you can't find anywhere is a
+great way to help the documentation team direct efforts towards what users are
+looking for. `See this issue <https://github.com/numpy/numpy/issues/15760>`__
+for an example of how to do this.
+
+Finally, if you detect a typo or an error in the documentation, or would like to
+suggest a different approach, you can also open an issue or submit a pull
+request with your suggestion. Keep in mind that changes fixing
+grammatical/spelling errors are welcome but not necessarily the highest
+priority. "Grammatical correctness" often gets confused with "style" which can
+result in unfruitful discussions that don't necessarily improve anything.
+Changes that modify wording or rearrange phrasing without changing the technical
+content are discouraged. If you think that a different wording improves clarity,
+you should open an issue as noted above, but again, changes along these lines
+very often tend to be highly subjective and do not necessarily do much to
+improve the quality of the documentation.
+
+**Final tips**
+
+- Don't worry if English is not your first language. Do your best - we'll revise
+  your content and make sure we fix any issues with the code or text.
+- If you are unsure whether your tutorial is useful to the community, consider
+  submitting an issue on GitHub suggesting it, or asking on the mailing
+  list or Stack Overflow.
+- If you are unfamiliar with git/GitHub or the process of submitting a pull
+  request (PR), check our :ref:`Contributor Guide <devindex>`.
+
+**Other interesting material**
+
+- `writethedocs.org <https://www.writethedocs.org/>`__ has a lot of interesting
+  resources for technical writing.
+- Google offers two free `Technical Writing Courses
+  <https://developers.google.com/tech-writing>`__
+- `Software Carpentry <https://software-carpentry.org/software>`__ has a lot of
+  nice recommendations for creating educational material.
diff --git a/doc/source/dev/index.rst b/doc/source/dev/index.rst
index dc6099992..5270bfb77 100644
--- a/doc/source/dev/index.rst
+++ b/doc/source/dev/index.rst
@@ -1,3 +1,5 @@
+.. _devindex:
+
 #####################
 Contributing to NumPy
 #####################
@@ -284,6 +286,7 @@ The rest of the story
    style_guide
    releasing
    governance/index
+   howto-docs
 
 NumPy-specific workflow is in :ref:`numpy-development-workflow
 <development-workflow>`.
diff --git a/doc/source/reference/global_state.rst b/doc/source/reference/global_state.rst
new file mode 100644
index 000000000..2a163390e
--- /dev/null
+++ b/doc/source/reference/global_state.rst
@@ -0,0 +1,85 @@
+.. _global_state:
+
+************
+Global State
+************
+
+NumPy has a few import-time, compile-time, or runtime options
+which change the global behaviour.
+Most of these are related to performance or for debugging
+purposes and will not be interesting to the vast majority
+of users.
+
+
+Performance-Related Options
+===========================
+
+Number of Threads used for Linear Algebra
+-----------------------------------------
+
+NumPy itself is normally intentionally limited to a single thread
+during function calls, however it does support multiple Python
+threads running at the same time.
+Note that for performant linear algebra NumPy uses a BLAS backend
+such as OpenBLAS or MKL, which may use multiple threads that may
+be controlled by environment variables such as ``OMP_NUM_THREADS``
+depending on what is used.
+One way to control the number of threads is the package
+`threadpoolctl <https://pypi.org/project/threadpoolctl/>`_.
+
+
+Madvise Hugepage on Linux
+-------------------------
+
+When working with very large arrays on modern Linux kernels,
+you can experience a significant speedup when
+`transparent hugepage <https://www.kernel.org/doc/html/latest/admin-guide/mm/transhuge.html>`_
+is used.
+The current system policy for transparent hugepages can be seen by::
+
+    cat /sys/kernel/mm/transparent_hugepage/enabled
+
+When set to ``madvise`` NumPy will typically use hugepages for a performance
+boost. This behaviour can be modified by setting the environment variable::
+
+    NUMPY_MADVISE_HUGEPAGE=0
+
+or setting it to ``1`` to always enable it. When not set, the default
+is to use madvise on Kernels 4.6 and newer. These kernels presumably
+experience a large speedup with hugepage support.
+This flag is checked at import time.
+
+
+Interoperability-Related Options
+================================
+
+The array function protocol which allows array-like objects to
+hook into the NumPy API is currently enabled by default.
+This option exists since NumPy 1.16 and is enabled by default since
+NumPy 1.17. It can be disabled using::
+
+    NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0
+
+See also :py:meth:`numpy.class.__array_function__` for more information.
+This flag is checked at import time.
+
+
+Debugging-Related Options
+=========================
+
+Relaxed Strides Checking
+------------------------
+
+The *compile-time* environment variables::
+
+    NPY_RELAXED_STRIDES_DEBUG=0
+    NPY_RELAXED_STRIDES_CHECKING=1
+
+control how NumPy reports contiguity for arrays.
+The default is that relaxed strides checking is enabled and the debug mode is disabled.
+This setting should always be enabled. Setting the
+debug option can be interesting for testing code written
+in C which iterates through arrays that may or may not be
+contiguous in memory.
+Most users will have no reason to change these, for details
+please see the :ref:`memory layout <memory-layout>` documentation.
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index d042a1849..2e1dcafa2 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -12,7 +12,7 @@ NumPy Reference
 
 This reference manual details functions, modules, and objects
 included in NumPy, describing what they are and what they do.
-For learning how to use NumPy, see the :ref:`complete documentation <manual>`.
+For learning how to use NumPy, see the :ref:`complete documentation <numpy_docs_mainpage>`.
 
 
 .. toctree::
@@ -22,6 +22,7 @@ For learning how to use NumPy, see the :ref:`complete documentation <manual>`.
    constants
    ufuncs
    routines
+   global_state
    distutils
    distutils_guide
    c-api/index
diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst
index bda9c4d96..d559f2327 100644
--- a/doc/source/reference/random/index.rst
+++ b/doc/source/reference/random/index.rst
@@ -168,6 +168,9 @@ What's New or Different
   Python's `random.random`.
 * All BitGenerators in numpy use `SeedSequence` to convert seeds into
   initialized states.
+* The addition of an ``axis`` keyword argument to methods such as
+  `Generator.choice`, `Generator.permutation`, and `Generator.shuffle`
+  improves support for sampling from and shuffling multi-dimensional arrays.
 
 See :ref:`new-or-different` for a complete list of improvements and
 differences from the traditional ``Randomstate``.
diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst
index 1d6b09faf..03e7775a0 100644
--- a/doc/source/reference/random/new-or-different.rst
+++ b/doc/source/reference/random/new-or-different.rst
@@ -115,3 +115,15 @@ And in more detail:
   rg.random(out=existing[:2])
   print(existing)
 
+* Optional ``axis`` argument for methods like `~.Generator.choice`,
+  `~.Generator.permutation` and `~.Generator.shuffle` that controls which
+  axis an operation is performed over for multi-dimensional arrays.
+
+.. ipython:: python
+
+  rg = Generator(PCG64(123456789))
+  a = np.arange(12).reshape((3, 4))
+  a
+  rg.choice(a, axis=1, size=5)
+  rg.shuffle(a, axis=1)        # Shuffle in-place
+  a
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 0a1402bf7..1e5697828 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -6,6 +6,7 @@ Release Notes
     :maxdepth: 3
 
     1.19.0 <release/1.19.0-notes>
+    1.18.4 <release/1.18.4-notes>
     1.18.3 <release/1.18.3-notes>
     1.18.2 <release/1.18.2-notes>
     1.18.1 <release/1.18.1-notes>
diff --git a/doc/source/release/1.18.4-notes.rst b/doc/source/release/1.18.4-notes.rst
new file mode 100644
index 000000000..25ef1d127
--- /dev/null
+++ b/doc/source/release/1.18.4-notes.rst
@@ -0,0 +1,38 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.18.4 Release Notes
+==========================
+
+This is the last planned release in the 1.18.x series. It reverts the
+``bool("0")`` behavior introduced in 1.18.3 and fixes a bug in
+``Generator.integers``. There is also a link to a new troubleshooting section
+in the documentation included in the error message emitted when numpy import
+fails.
+
+The Python versions supported in this release are 3.5-3.8. Downstream
+developers should use Cython >= 0.29.15 for Python 3.8 support and
+OpenBLAS >= 0.3.7 to avoid errors on the Skylake architecture.
+
+Contributors
+============
+
+A total of 4 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Charles Harris
+* Matti Picus
+* Sebastian Berg
+* Warren Weckesser
+
+Pull requests merged
+====================
+
+A total of 6 pull requests were merged for this release.
+
+* `#16055 <https://github.com/numpy/numpy/pull/16055>`__: BLD: add i686 for 1.18 builds
+* `#16090 <https://github.com/numpy/numpy/pull/16090>`__: BUG: random: ``Generator.integers(2**32)`` always returned 0.
+* `#16091 <https://github.com/numpy/numpy/pull/16091>`__: BLD: fix path to libgfortran on macOS
+* `#16109 <https://github.com/numpy/numpy/pull/16109>`__: REV: Reverts side-effect changes to casting
+* `#16114 <https://github.com/numpy/numpy/pull/16114>`__: BLD: put openblas library in local directory on windows
+* `#16132 <https://github.com/numpy/numpy/pull/16132>`__: DOC: Change import error "howto" to link to new troubleshooting...
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index 6ad0e52f9..29a4d80ca 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -760,7 +760,7 @@ stacks 1D arrays as columns into a 2D array. It is equivalent to
            [2., 8.]])
     >>> np.hstack((a,b))           # the result is different
     array([4., 2., 3., 8.])
-    >>> a[:,newaxis]               # this allows to have a 2D columns vector
+    >>> a[:,newaxis]               # view `a` as a 2D column vector
     array([[4.],
            [2.]])
     >>> np.column_stack((a[:,newaxis],b[:,newaxis]))
diff --git a/doc/source/user/setting-up.rst b/doc/source/user/setting-up.rst
index f70dacf82..7ca3a365c 100644
--- a/doc/source/user/setting-up.rst
+++ b/doc/source/user/setting-up.rst
@@ -7,3 +7,4 @@ Setting up
 
    whatisnumpy
    install
+   troubleshooting-importerror
diff --git a/doc/source/user/troubleshooting-importerror.rst b/doc/source/user/troubleshooting-importerror.rst
new file mode 100644
index 000000000..131e698dd
--- /dev/null
+++ b/doc/source/user/troubleshooting-importerror.rst
@@ -0,0 +1,134 @@
+***************************
+Troubleshooting ImportError
+***************************
+
+.. note::
+
+    Since this information may be updated regularly, please ensure you are
+    viewing the most `up-to-date version <https://numpy.org/devdocs/user/troubleshooting-importerror.html>`_.
+
+
+ImportError
+===========
+
+In certain cases a failed installation or setup issue can cause you to
+see the following error message::
+
+    IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!
+
+    Importing the numpy C-extensions failed. This error can happen for many
+    reasons, often due to issues with your setup or how NumPy was installed.
+
+The error also has additional information to help you troubleshoot:
+
+* Your Python version
+* Your NumPy version
+
+Please check both of these carefully to see if they are what you expect.
+You may need to check your ``PATH`` or ``PYTHONPATH`` environment variables
+(see `Check Environment Variables`_ below).
+
+The following sections list commonly reported issues depending on your setup.
+If you have an issue/solution that you think should appear please open a
+NumPy issue so that it will be added.
+
+If none of the tips in the sections below help you, please be sure to note
+the following:
+
+* how you installed Python
+* how you installed NumPy
+* your operating system
+* whether or not you have multiple versions of Python installed
+* if you built from source, your compiler versions and ideally a build log
+
+when investigating further and asking for support.
+
+
+Using Python from ``conda`` (Anaconda)
+--------------------------------------
+
+Please make sure that you have activated your conda environment.
+See also the `conda user-guide <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#activating-an-environment>`_.
+If you use an external editor/development environment it will have to be set
+up correctly.  See below for solutions for some common setups.
+
+Using PyCharm with Anaconda/conda Python
+----------------------------------------
+
+There are fairly common issues when using PyCharm together with Anaconda;
+please see the `PyCharm support <https://www.jetbrains.com/help/pycharm/conda-support-creating-conda-virtual-environment.html>`_ for details.
+
+Using VSCode with Anaconda/conda Python (or environments)
+---------------------------------------------------------
+
+A commonly reported issue is related to the environment activation within
+VSCode. Please see the `VSCode support <https://code.visualstudio.com/docs/python/environments>`_
+for information on how to correctly set up VSCode with virtual environments
+or conda.
+
+
+Raspberry Pi
+------------
+
+There are sometimes issues reported on Raspberry Pi setups when installing
+using ``pip3 install`` (or ``pip install``). These will typically mention::
+
+    libf77blas.so.3: cannot open shared object file: No such file or directory
+
+
+The solution will be to either::
+
+    sudo apt-get install libatlas-base-dev
+
+to install the missing libraries expected by the self-compiled NumPy
+(ATLAS is a possible provider of linear algebra).
+
+*Alternatively*, use the NumPy provided by Raspbian. In this case, run::
+
+    pip3 uninstall numpy  # remove previously installed version
+    apt install python3-numpy
+
+
+Debug build on Windows
+----------------------
+
+Rather than building your project in ``DEBUG`` mode on Windows, try
+building in ``RELEASE`` mode with debug symbols and no optimization.
+Full ``DEBUG`` mode on Windows changes the names of the DLLs Python
+expects to find, so if you wish to truly work in ``DEBUG`` mode you will
+need to recompile the entire stack of Python modules you work with,
+including NumPy.
+
+
+All Setups
+----------
+
+Occasionally there may be simple issues with old or bad installations
+of NumPy. In this case you may just try to uninstall and reinstall NumPy.
+Make sure that NumPy is not found after uninstalling.
+
+
+Development Setup
+-----------------
+
+If you are using a development setup, make sure to run ``git clean -xdf``
+to delete all files not under version control (be careful not to lose
+any modifications you made, e.g. ``site.cfg``).
+In many cases files from old builds may lead to incorrect builds.
+
+
+Check Environment Variables
+---------------------------
+
+In general, how to set and check your environment variables depends on
+your system. If you can open a correct Python shell, you can also run the
+following in Python::
+
+    import os
+    PYTHONPATH = os.environ.get('PYTHONPATH', '').split(os.pathsep)
+    print("The PYTHONPATH is:", PYTHONPATH)
+    PATH = os.environ.get('PATH', '').split(os.pathsep)
+    print("The PATH is:", PATH)
+
+This is mainly helpful if you are not running the Python and/or NumPy
+version you are expecting to run.
diff --git a/doc_requirements.txt b/doc_requirements.txt
index ce1c33b17..52b51cd6e 100644
--- a/doc_requirements.txt
+++ b/doc_requirements.txt
@@ -1,4 +1,4 @@
-sphinx>=2.2.0
+sphinx>=2.2.0,<3.0
 ipython
 scipy
 matplotlib
diff --git a/numpy/__init__.pxd b/numpy/__init__.pxd
index 23bd22e36..7fc3d61e7 100644
--- a/numpy/__init__.pxd
+++ b/numpy/__init__.pxd
@@ -254,20 +254,8 @@ cdef extern from "numpy/arrayobject.h":
             npy_intp *shape "dimensions"
             npy_intp *strides
             dtype descr  # deprecated since NumPy 1.7 !
-            PyObject* base
+            PyObject* base #  NOT PUBLIC, DO NOT USE !
 
-        # Note: This syntax (function definition in pxd files) is an
-        # experimental exception made for __getbuffer__ and __releasebuffer__
-        # -- the details of this may change.
-        def __getbuffer__(ndarray self, Py_buffer* info, int flags):
-            PyObject_GetBuffer(<object>self, info, flags);
-
-        def __releasebuffer__(ndarray self, Py_buffer* info):
-            # We should call a possible tp_bufferrelease(self, info) but no
-            # interface to that is exposed by cython or python. And currently
-            # the function is NULL in numpy, we rely on refcounting to release
-            # info when self is collected
-            pass
 
 
     ctypedef unsigned char      npy_bool
@@ -345,103 +333,107 @@ cdef extern from "numpy/arrayobject.h":
         int len
 
     int _import_array() except -1
+    # A second definition so _import_array isn't marked as used when we use it here.
+    # Do not use - subject to change any time.
+    int __pyx_import_array "_import_array"() except -1
 
     #
     # Macros from ndarrayobject.h
     #
-    bint PyArray_CHKFLAGS(ndarray m, int flags)
-    bint PyArray_IS_C_CONTIGUOUS(ndarray arr)
-    bint PyArray_IS_F_CONTIGUOUS(ndarray arr)
-    bint PyArray_ISCONTIGUOUS(ndarray m)
-    bint PyArray_ISWRITEABLE(ndarray m)
-    bint PyArray_ISALIGNED(ndarray m)
-
-    int PyArray_NDIM(ndarray)
-    bint PyArray_ISONESEGMENT(ndarray)
-    bint PyArray_ISFORTRAN(ndarray)
-    int PyArray_FORTRANIF(ndarray)
-
-    void* PyArray_DATA(ndarray)
-    char* PyArray_BYTES(ndarray)
-    npy_intp* PyArray_DIMS(ndarray)
-    npy_intp* PyArray_STRIDES(ndarray)
-    npy_intp PyArray_DIM(ndarray, size_t)
-    npy_intp PyArray_STRIDE(ndarray, size_t)
-
-    PyObject *PyArray_BASE(ndarray)  # returns borrowed reference!
-    PyArray_Descr *PyArray_DESCR(ndarray) # returns borrowed reference to dtype!
-    int PyArray_FLAGS(ndarray)
-    npy_intp PyArray_ITEMSIZE(ndarray)
-    int PyArray_TYPE(ndarray arr)
+    bint PyArray_CHKFLAGS(ndarray m, int flags) nogil
+    bint PyArray_IS_C_CONTIGUOUS(ndarray arr) nogil
+    bint PyArray_IS_F_CONTIGUOUS(ndarray arr) nogil
+    bint PyArray_ISCONTIGUOUS(ndarray m) nogil
+    bint PyArray_ISWRITEABLE(ndarray m) nogil
+    bint PyArray_ISALIGNED(ndarray m) nogil
+
+    int PyArray_NDIM(ndarray) nogil
+    bint PyArray_ISONESEGMENT(ndarray) nogil
+    bint PyArray_ISFORTRAN(ndarray) nogil
+    int PyArray_FORTRANIF(ndarray) nogil
+
+    void* PyArray_DATA(ndarray) nogil
+    char* PyArray_BYTES(ndarray) nogil
+
+    npy_intp* PyArray_DIMS(ndarray) nogil
+    npy_intp* PyArray_STRIDES(ndarray) nogil
+    npy_intp PyArray_DIM(ndarray, size_t) nogil
+    npy_intp PyArray_STRIDE(ndarray, size_t) nogil
+
+    PyObject *PyArray_BASE(ndarray) nogil  # returns borrowed reference!
+    PyArray_Descr *PyArray_DESCR(ndarray) nogil  # returns borrowed reference to dtype!
+    int PyArray_FLAGS(ndarray) nogil
+    npy_intp PyArray_ITEMSIZE(ndarray) nogil
+    int PyArray_TYPE(ndarray arr) nogil
 
     object PyArray_GETITEM(ndarray arr, void *itemptr)
     int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
 
-    bint PyTypeNum_ISBOOL(int)
-    bint PyTypeNum_ISUNSIGNED(int)
-    bint PyTypeNum_ISSIGNED(int)
-    bint PyTypeNum_ISINTEGER(int)
-    bint PyTypeNum_ISFLOAT(int)
-    bint PyTypeNum_ISNUMBER(int)
-    bint PyTypeNum_ISSTRING(int)
-    bint PyTypeNum_ISCOMPLEX(int)
-    bint PyTypeNum_ISPYTHON(int)
-    bint PyTypeNum_ISFLEXIBLE(int)
-    bint PyTypeNum_ISUSERDEF(int)
-    bint PyTypeNum_ISEXTENDED(int)
-    bint PyTypeNum_ISOBJECT(int)
-
-    bint PyDataType_ISBOOL(dtype)
-    bint PyDataType_ISUNSIGNED(dtype)
-    bint PyDataType_ISSIGNED(dtype)
-    bint PyDataType_ISINTEGER(dtype)
-    bint PyDataType_ISFLOAT(dtype)
-    bint PyDataType_ISNUMBER(dtype)
-    bint PyDataType_ISSTRING(dtype)
-    bint PyDataType_ISCOMPLEX(dtype)
-    bint PyDataType_ISPYTHON(dtype)
-    bint PyDataType_ISFLEXIBLE(dtype)
-    bint PyDataType_ISUSERDEF(dtype)
-    bint PyDataType_ISEXTENDED(dtype)
-    bint PyDataType_ISOBJECT(dtype)
-    bint PyDataType_HASFIELDS(dtype)
-    bint PyDataType_HASSUBARRAY(dtype)
-
-    bint PyArray_ISBOOL(ndarray)
-    bint PyArray_ISUNSIGNED(ndarray)
-    bint PyArray_ISSIGNED(ndarray)
-    bint PyArray_ISINTEGER(ndarray)
-    bint PyArray_ISFLOAT(ndarray)
-    bint PyArray_ISNUMBER(ndarray)
-    bint PyArray_ISSTRING(ndarray)
-    bint PyArray_ISCOMPLEX(ndarray)
-    bint PyArray_ISPYTHON(ndarray)
-    bint PyArray_ISFLEXIBLE(ndarray)
-    bint PyArray_ISUSERDEF(ndarray)
-    bint PyArray_ISEXTENDED(ndarray)
-    bint PyArray_ISOBJECT(ndarray)
-    bint PyArray_HASFIELDS(ndarray)
-
-    bint PyArray_ISVARIABLE(ndarray)
-
-    bint PyArray_SAFEALIGNEDCOPY(ndarray)
-    bint PyArray_ISNBO(char)              # works on ndarray.byteorder
-    bint PyArray_IsNativeByteOrder(char)  # works on ndarray.byteorder
-    bint PyArray_ISNOTSWAPPED(ndarray)
-    bint PyArray_ISBYTESWAPPED(ndarray)
-
-    bint PyArray_FLAGSWAP(ndarray, int)
-
-    bint PyArray_ISCARRAY(ndarray)
-    bint PyArray_ISCARRAY_RO(ndarray)
-    bint PyArray_ISFARRAY(ndarray)
-    bint PyArray_ISFARRAY_RO(ndarray)
-    bint PyArray_ISBEHAVED(ndarray)
-    bint PyArray_ISBEHAVED_RO(ndarray)
-
-
-    bint PyDataType_ISNOTSWAPPED(dtype)
-    bint PyDataType_ISBYTESWAPPED(dtype)
+    bint PyTypeNum_ISBOOL(int) nogil
+    bint PyTypeNum_ISUNSIGNED(int) nogil
+    bint PyTypeNum_ISSIGNED(int) nogil
+    bint PyTypeNum_ISINTEGER(int) nogil
+    bint PyTypeNum_ISFLOAT(int) nogil
+    bint PyTypeNum_ISNUMBER(int) nogil
+    bint PyTypeNum_ISSTRING(int) nogil
+    bint PyTypeNum_ISCOMPLEX(int) nogil
+    bint PyTypeNum_ISPYTHON(int) nogil
+    bint PyTypeNum_ISFLEXIBLE(int) nogil
+    bint PyTypeNum_ISUSERDEF(int) nogil
+    bint PyTypeNum_ISEXTENDED(int) nogil
+    bint PyTypeNum_ISOBJECT(int) nogil
+
+    bint PyDataType_ISBOOL(dtype) nogil
+    bint PyDataType_ISUNSIGNED(dtype) nogil
+    bint PyDataType_ISSIGNED(dtype) nogil
+    bint PyDataType_ISINTEGER(dtype) nogil
+    bint PyDataType_ISFLOAT(dtype) nogil
+    bint PyDataType_ISNUMBER(dtype) nogil
+    bint PyDataType_ISSTRING(dtype) nogil
+    bint PyDataType_ISCOMPLEX(dtype) nogil
+    bint PyDataType_ISPYTHON(dtype) nogil
+    bint PyDataType_ISFLEXIBLE(dtype) nogil
+    bint PyDataType_ISUSERDEF(dtype) nogil
+    bint PyDataType_ISEXTENDED(dtype) nogil
+    bint PyDataType_ISOBJECT(dtype) nogil
+    bint PyDataType_HASFIELDS(dtype) nogil
+    bint PyDataType_HASSUBARRAY(dtype) nogil
+
+    bint PyArray_ISBOOL(ndarray) nogil
+    bint PyArray_ISUNSIGNED(ndarray) nogil
+    bint PyArray_ISSIGNED(ndarray) nogil
+    bint PyArray_ISINTEGER(ndarray) nogil
+    bint PyArray_ISFLOAT(ndarray) nogil
+    bint PyArray_ISNUMBER(ndarray) nogil
+    bint PyArray_ISSTRING(ndarray) nogil
+    bint PyArray_ISCOMPLEX(ndarray) nogil
+    bint PyArray_ISPYTHON(ndarray) nogil
+    bint PyArray_ISFLEXIBLE(ndarray) nogil
+    bint PyArray_ISUSERDEF(ndarray) nogil
+    bint PyArray_ISEXTENDED(ndarray) nogil
+    bint PyArray_ISOBJECT(ndarray) nogil
+    bint PyArray_HASFIELDS(ndarray) nogil
+
+    bint PyArray_ISVARIABLE(ndarray) nogil
+
+    bint PyArray_SAFEALIGNEDCOPY(ndarray) nogil
+    bint PyArray_ISNBO(char) nogil              # works on ndarray.byteorder
+    bint PyArray_IsNativeByteOrder(char) nogil # works on ndarray.byteorder
+    bint PyArray_ISNOTSWAPPED(ndarray) nogil
+    bint PyArray_ISBYTESWAPPED(ndarray) nogil
+
+    bint PyArray_FLAGSWAP(ndarray, int) nogil
+
+    bint PyArray_ISCARRAY(ndarray) nogil
+    bint PyArray_ISCARRAY_RO(ndarray) nogil
+    bint PyArray_ISFARRAY(ndarray) nogil
+    bint PyArray_ISFARRAY_RO(ndarray) nogil
+    bint PyArray_ISBEHAVED(ndarray) nogil
+    bint PyArray_ISBEHAVED_RO(ndarray) nogil
+
+
+    bint PyDataType_ISNOTSWAPPED(dtype) nogil
+    bint PyDataType_ISBYTESWAPPED(dtype) nogil
 
     bint PyArray_DescrCheck(object)
 
@@ -461,10 +453,11 @@ cdef extern from "numpy/arrayobject.h":
     bint PyArray_IsPythonScalar(object)
     bint PyArray_IsAnyScalar(object)
     bint PyArray_CheckAnyScalar(object)
+
     ndarray PyArray_GETCONTIGUOUS(ndarray)
-    bint PyArray_SAMESHAPE(ndarray, ndarray)
-    npy_intp PyArray_SIZE(ndarray)
-    npy_intp PyArray_NBYTES(ndarray)
+    bint PyArray_SAMESHAPE(ndarray, ndarray) nogil
+    npy_intp PyArray_SIZE(ndarray) nogil
+    npy_intp PyArray_NBYTES(ndarray) nogil
 
     object PyArray_FROM_O(object)
     object PyArray_FROM_OF(object m, int flags)
@@ -477,16 +470,16 @@ cdef extern from "numpy/arrayobject.h":
     npy_intp PyArray_REFCOUNT(object)
     object PyArray_ContiguousFromAny(op, int, int min_depth, int max_depth)
     unsigned char PyArray_EquivArrTypes(ndarray a1, ndarray a2)
-    bint PyArray_EquivByteorders(int b1, int b2)
+    bint PyArray_EquivByteorders(int b1, int b2) nogil
     object PyArray_SimpleNew(int nd, npy_intp* dims, int typenum)
     object PyArray_SimpleNewFromData(int nd, npy_intp* dims, int typenum, void* data)
     #object PyArray_SimpleNewFromDescr(int nd, npy_intp* dims, dtype descr)
     object PyArray_ToScalar(void* data, ndarray arr)
 
-    void* PyArray_GETPTR1(ndarray m, npy_intp i)
-    void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j)
-    void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k)
-    void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l)
+    void* PyArray_GETPTR1(ndarray m, npy_intp i) nogil
+    void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) nogil
+    void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
+    void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
 
     void PyArray_XDECREF_ERR(ndarray)
     # Cannot be supported due to out arg
@@ -961,7 +954,7 @@ cdef inline object get_array_base(ndarray arr):
 # Cython code.
 cdef inline int import_array() except -1:
     try:
-        _import_array()
+        __pyx_import_array()
     except Exception:
         raise ImportError("numpy.core.multiarray failed to import")
 
@@ -976,3 +969,10 @@ cdef inline int import_ufunc() except -1:
         _import_umath()
     except Exception:
         raise ImportError("numpy.core.umath failed to import")
+
+cdef extern from *:
+    # Leave a marker that the NumPy declarations came from this file
+    # See https://github.com/cython/cython/issues/3573
+    """
+    /* NumPy API declarations from "numpy/__init__.pxd" */
+    """
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 2d3423c56..575e8ea3d 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -285,3 +285,24 @@ else:
                         error_message))
                 raise RuntimeError(msg)
     del _mac_os_check
+
+    # We usually use madvise hugepages support, but on some old kernels it
+    # is slow and thus better avoided.
+    # Specifically kernel version 4.6 had a bug fix which probably fixed this:
+    # https://github.com/torvalds/linux/commit/7cf91a98e607c2f935dbcc177d70011e95b8faff
+    import os
+    use_hugepage = os.environ.get("NUMPY_MADVISE_HUGEPAGE", None)
+    if sys.platform == "linux" and use_hugepage is None:
+        try:
+            release = os.uname().release.split(".")[:2]
+            use_hugepage = int(tuple(int(v) for v in release) >= (4, 6))
+        except ValueError:  # no numeric major.minor; must not break import
+            use_hugepage = 0
+    elif use_hugepage is None:
+        # This is not Linux, so it should not matter, just enable anyway
+        use_hugepage = 1
+    else:
+        use_hugepage = int(use_hugepage)
+
+    # Note that this will currently only make a difference on Linux
+    core.multiarray._set_madvise_hugepage(use_hugepage)
diff --git a/numpy/compat/setup.py b/numpy/compat/setup.py
index afa511673..c1b34a2cc 100644
--- a/numpy/compat/setup.py
+++ b/numpy/compat/setup.py
@@ -2,7 +2,7 @@ def configuration(parent_package='',top_path=None):
     from numpy.distutils.misc_util import Configuration
 
     config = Configuration('compat', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     return config
 
 if __name__ == '__main__':
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index c2d53fe3e..c77885954 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -26,25 +26,21 @@ except ImportError as exc:
 
 IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!
 
-Importing the numpy c-extensions failed.
-- Try uninstalling and reinstalling numpy.
-- If you have already done that, then:
-  1. Check that you expected to use Python%d.%d from "%s",
-     and that you have no directories in your PATH or PYTHONPATH that can
-     interfere with the Python and numpy version "%s" you're trying to use.
-  2. If (1) looks fine, you can open a new issue at
-     https://github.com/numpy/numpy/issues.  Please include details on:
-     - how you installed Python
-     - how you installed numpy
-     - your operating system
-     - whether or not you have multiple versions of Python installed
-     - if you built from source, your compiler versions and ideally a build log
-
-- If you're working with a numpy git repository, try `git clean -xdf`
-  (removes all files not under version control) and rebuild numpy.
-
-Note: this error has many possible causes, so please don't comment on
-an existing issue about this - open a new one instead.
+Importing the numpy C-extensions failed. This error can happen for
+many reasons, often due to issues with your setup or how NumPy was
+installed.
+
+We have compiled some common reasons and troubleshooting tips at:
+
+    https://numpy.org/devdocs/user/troubleshooting-importerror.html
+
+Please note and check the following:
+
+  * The Python version is: Python%d.%d from "%s"
+  * The NumPy version is: "%s"
+
+and make sure that they are the versions you expect.
+Please carefully study the documentation linked above for further help.
 
 Original error was: %s
 """ % (sys.version_info[0], sys.version_info[1], sys.executable,
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 18ab10078..f43b77c44 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -4394,6 +4394,14 @@ add_newdoc('numpy.core.umath', '_add_newdoc_ufunc',
     and then throwing away the ufunc.
     """)
 
+add_newdoc('numpy.core.multiarray', '_set_madvise_hugepage',
+    """
+    _set_madvise_hugepage(enabled: bool) -> bool
+
+    Set or unset use of ``madvise (2)`` MADV_HUGEPAGE support when
+    allocating the array data. Returns the previously set value.
+    See `global_state` for more information.
+    """)
 
 add_newdoc('numpy.core._multiarray_tests', 'format_float_OSprintf_g',
     """
@@ -5001,7 +5009,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
     """))
 
 add_newdoc('numpy.core', 'ufunc', ('outer',
-    """
+    r"""
     outer(A, B, **kwargs)
 
     Apply the ufunc `op` to all pairs (a, b) with a in `A` and b in `B`.
@@ -5035,7 +5043,13 @@ add_newdoc('numpy.core', 'ufunc', ('outer',
 
     See Also
     --------
-    numpy.outer
+    numpy.outer : A less powerful version of ``np.multiply.outer``
+                  that `ravel`\ s all inputs to 1D. This exists
+                  primarily for compatibility with old code.
+
+    tensordot : ``np.tensordot(a, b, axes=((), ()))`` and
+                ``np.multiply.outer(a, b)`` behave the same for all
+                dimensions of a and b.
 
     Examples
     --------
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 129516658..82cd6fb27 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -1835,6 +1835,17 @@ add_newdoc('numpy.core.umath', 'left_shift',
     >>> np.left_shift(5, [1,2,3])
     array([10, 20, 40])
 
+    Note that the dtype of the second argument may change the dtype of the
+    result and can lead to unexpected results in some cases (see
+    :ref:`Casting Rules <ufuncs.casting>`):
+
+    >>> a = np.left_shift(np.uint8(255), 1) # Expect 254
+    >>> print(a, type(a)) # Unexpected result due to upcasting
+    510 <class 'numpy.int64'>
+    >>> b = np.left_shift(np.uint8(255), np.uint8(1))
+    >>> print(b, type(b))
+    254 <class 'numpy.uint8'>
+
     """)
 
 add_newdoc('numpy.core.umath', 'less',
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index ad66446c2..cb025736e 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -209,10 +209,12 @@ class memmap(ndarray):
         import os.path
         try:
             mode = mode_equivalents[mode]
         except KeyError:
             if mode not in valid_filemodes:
-                raise ValueError("mode must be one of %s" %
-                                 (valid_filemodes + list(mode_equivalents.keys())))
+                raise ValueError(
+                    "mode must be one of {!r} (got {!r})"
+                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
+                ) from None
 
         if mode == 'w+' and shape is None:
             raise ValueError("shape must be given")
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index e207280f0..f3d48459a 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -17,7 +17,7 @@ from ._multiarray_umath import *  # noqa: F403
 # _get_ndarray_c_version is semi-public, on purpose not added to __all__
 from ._multiarray_umath import (
     _fastCopyAndTranspose, _flagdict, _insert, _reconstruct, _vec_string,
-    _ARRAY_API, _monotonicity, _get_ndarray_c_version
+    _ARRAY_API, _monotonicity, _get_ndarray_c_version, _set_madvise_hugepage,
     )
 
 __all__ = [
@@ -1266,7 +1266,13 @@ def shares_memory(a, b, max_work=None):
     """
     shares_memory(a, b, max_work=None)
 
-    Determine if two arrays share memory
+    Determine if two arrays share memory.
+
+    .. warning::
+
+       This function can be exponentially slow for some inputs, unless
+       `max_work` is set to a finite number or ``MAY_SHARE_BOUNDS``.
+       If in doubt, use `numpy.may_share_memory` instead.
 
     Parameters
     ----------
@@ -1279,7 +1285,8 @@ def shares_memory(a, b, max_work=None):
 
         max_work=MAY_SHARE_EXACT  (default)
             The problem is solved exactly. In this case, the function returns
-            True only if there is an element shared between the arrays.
+            True only if there is an element shared between the arrays. Finding
+            the exact solution may take extremely long in some cases.
         max_work=MAY_SHARE_BOUNDS
             Only the memory bounds of a and b are checked.
 
@@ -1298,9 +1305,33 @@ def shares_memory(a, b, max_work=None):
 
     Examples
     --------
-    >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
+    >>> x = np.array([1, 2, 3, 4])
+    >>> np.shares_memory(x, np.array([5, 6, 7]))
+    False
+    >>> np.shares_memory(x[::2], x)
+    True
+    >>> np.shares_memory(x[::2], x[1::2])
     False
 
+    Checking whether two arrays share memory is NP-complete, and
+    runtime may increase exponentially in the number of
+    dimensions. Hence, `max_work` should generally be set to a finite
+    number, as it is possible to construct examples that take
+    extremely long to run:
+
+    >>> from numpy.lib.stride_tricks import as_strided
+    >>> x = np.zeros([192163377], dtype=np.int8)
+    >>> x1 = as_strided(x, strides=(36674, 61119, 85569), shape=(1049, 1049, 1049))
+    >>> x2 = as_strided(x[64023025:], strides=(12223, 12224, 1), shape=(1049, 1049, 1))
+    >>> np.shares_memory(x1, x2, max_work=1000)
+    Traceback (most recent call last):
+    ...
+    numpy.TooHardError: Exceeded max_work
+
+    Running ``np.shares_memory(x1, x2)`` without `max_work` set takes
+    around 1 minute for this case. It is possible to find problems
+    that take significantly longer still.
+
     """
     return (a, b)
 
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 83d985a7c..8bd4e241b 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -857,8 +857,11 @@ def outer(a, b, out=None):
     --------
     inner
     einsum : ``einsum('i,j->ij', a.ravel(), b.ravel())`` is the equivalent.
-    ufunc.outer : A generalization to N dimensions and other operations.
-                  ``np.multiply.outer(a.ravel(), b.ravel())`` is the equivalent.
+    ufunc.outer : A generalization to dimensions other than 1D and other
+                  operations. ``np.multiply.outer(a.ravel(), b.ravel())``
+                  is the equivalent.
+    tensordot : ``np.tensordot(a.ravel(), b.ravel(), axes=((), ()))``
+                is the equivalent.
 
     References
     ----------
@@ -1226,11 +1229,39 @@ def rollaxis(a, axis, start=0):
     a : ndarray
         Input array.
     axis : int
-        The axis to roll backwards.  The positions of the other axes do not
+        The axis to be rolled. The positions of the other axes do not
         change relative to one another.
     start : int, optional
-        The axis is rolled until it lies before this position.  The default,
-        0, results in a "complete" roll.
+        When ``start <= axis``, the axis is rolled back until it lies in
+        this position. When ``start > axis``, the axis is rolled until it
+        lies before this position. The default, 0, results in a "complete"
+        roll. The following table describes how negative values of ``start``
+        are interpreted:
+
+        .. table::
+           :align: left
+
+           +-------------------+----------------------+
+           |     ``start``     | Normalized ``start`` |
+           +===================+======================+
+           | ``-(arr.ndim+1)`` | raise ``AxisError``  |
+           +-------------------+----------------------+
+           | ``-arr.ndim``     | 0                    |
+           +-------------------+----------------------+
+           | |vdots|           | |vdots|              |
+           +-------------------+----------------------+
+           | ``-1``            | ``arr.ndim-1``       |
+           +-------------------+----------------------+
+           | ``0``             | ``0``                |
+           +-------------------+----------------------+
+           | |vdots|           | |vdots|              |
+           +-------------------+----------------------+
+           | ``arr.ndim``      | ``arr.ndim``         |
+           +-------------------+----------------------+
+           | ``arr.ndim + 1``  | raise ``AxisError``  |
+           +-------------------+----------------------+
+
+        .. |vdots|   unicode:: U+22EE .. Vertical Ellipsis
 
     Returns
     -------
@@ -2279,12 +2310,12 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
         return cond[()]  # Flatten 0d arrays to scalars
 
 
-def _array_equal_dispatcher(a1, a2):
+def _array_equal_dispatcher(a1, a2, equal_nan=None):
     return (a1, a2)
 
 
 @array_function_dispatch(_array_equal_dispatcher)
-def array_equal(a1, a2):
+def array_equal(a1, a2, equal_nan=False):
     """
     True if two arrays have the same shape and elements, False otherwise.
 
@@ -2292,6 +2323,12 @@ def array_equal(a1, a2):
     ----------
     a1, a2 : array_like
         Input arrays.
+    equal_nan : bool
+        Whether to compare NaN's as equal. If the dtype of a1 and a2 is
+        complex, values will be considered equal if either the real or the
+        imaginary component of a given value is ``nan``.
+
+        .. versionadded:: 1.19.0
 
     Returns
     -------
@@ -2315,7 +2352,21 @@ def array_equal(a1, a2):
     False
     >>> np.array_equal([1, 2], [1, 4])
     False
+    >>> a = np.array([1, np.nan])
+    >>> np.array_equal(a, a)
+    False
+    >>> np.array_equal(a, a, equal_nan=True)
+    True
 
+    When ``equal_nan`` is True, complex values with nan components are
+    considered equal if either the real *or* the imaginary components are nan.
+
+    >>> a = np.array([1 + 1j])
+    >>> b = a.copy()
+    >>> a.real = np.nan
+    >>> b.imag = np.nan
+    >>> np.array_equal(a, b, equal_nan=True)
+    True
     """
     try:
         a1, a2 = asarray(a1), asarray(a2)
@@ -2323,7 +2374,15 @@ def array_equal(a1, a2):
         return False
     if a1.shape != a2.shape:
         return False
-    return bool(asarray(a1 == a2).all())
+    if not equal_nan:
+        return bool(asarray(a1 == a2).all())
+    # Handling NaN values if equal_nan is True
+    a1nan, a2nan = isnan(a1), isnan(a2)
+    # NaN's occur at different locations
+    if not (a1nan == a2nan).all():
+        return False
+    # Shapes of a1, a2 and masks are guaranteed to be consistent by this point
+    return bool(asarray(a1[~a1nan] == a2[~a1nan]).all())
 
 
 def _array_equiv_dispatcher(a1, a2):
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 15e732614..76f3f5abe 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -961,7 +961,7 @@ def configuration(parent_package='',top_path=None):
     config.add_extension('_operand_flag_tests',
                     sources=[join('src', 'umath', '_operand_flag_tests.c.src')])
 
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     config.add_data_dir('tests/data')
 
     config.make_svn_version_py()
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index c2b7e9ca7..795fc7315 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -47,6 +47,32 @@ typedef struct {
 static cache_bucket datacache[NBUCKETS];
 static cache_bucket dimcache[NBUCKETS_DIM];
 
+static int _madvise_hugepage = 1;
+
+
+/*
+ * Enables or disables the use of `MADV_HUGEPAGE` on Linux by storing the
+ * truth value of `enabled_obj` in the global static `_madvise_hugepage`.
+ * Returns the previous setting as a bool; NULL if `PyObject_IsTrue` fails.
+ *
+ * It is exposed to Python as `np.core.multiarray._set_madvise_hugepage`.
+ */
+NPY_NO_EXPORT PyObject *
+_set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj)
+{
+    int was_enabled = _madvise_hugepage;
+    int enabled = PyObject_IsTrue(enabled_obj);
+    if (enabled < 0) {  /* PyObject_IsTrue returns -1 on failure */
+        return NULL;
+    }
+    _madvise_hugepage = enabled;
+    if (was_enabled) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+
 /* as the cache is managed in global variables verify the GIL is held */
 
 /*
@@ -75,7 +101,7 @@ _npy_alloc_cache(npy_uintp nelem, npy_uintp esz, npy_uint msz,
 #endif
 #ifdef NPY_OS_LINUX
         /* allow kernel allocating huge pages for large arrays */
-        if (NPY_UNLIKELY(nelem * esz >= ((1u<<22u)))) {
+        if (NPY_UNLIKELY(nelem * esz >= ((1u<<22u))) && _madvise_hugepage) {
             npy_uintp offset = 4096u - (npy_uintp)p % (4096u);
             npy_uintp length = nelem * esz - offset;
             /**
diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h
index 2b69efc35..15e31ebb5 100644
--- a/numpy/core/src/multiarray/alloc.h
+++ b/numpy/core/src/multiarray/alloc.h
@@ -6,6 +6,9 @@
 
 #define NPY_TRACE_DOMAIN 389047
 
+NPY_NO_EXPORT PyObject *
+_set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj);
+
 NPY_NO_EXPORT void *
 npy_alloc_cache(npy_uintp sz);
 
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 5e07f0df4..024dcab8c 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -1489,6 +1489,7 @@ OBJECT_to_@TOTYPE@(void *input, void *output, npy_intp n,
  *
  * #from = STRING*23, UNICODE*23, VOID*23#
  * #fromtyp = npy_char*69#
+ * #is_string_to_bool = 1, 0*22, 1, 0*22, 0*23#
  * #to = (BOOL,
  *           BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *           LONG, ULONG, LONGLONG, ULONGLONG,
@@ -1525,6 +1526,13 @@ static void
         if (temp == NULL) {
             return;
         }
+#if @is_string_to_bool@
+        /* Legacy behaviour converts strings to integers before going to bool */
+        Py_SETREF(temp, PyNumber_Long(temp));
+        if (temp == NULL) {
+            return;
+        }
+#endif
         if (@to@_setitem(temp, op, aop)) {
             Py_DECREF(temp);
             return;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 1fef1f06b..e2026ec1c 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -1860,13 +1860,8 @@ array_reduce_ex(PyArrayObject *self, PyObject *args)
          * method that involves using a temporary bytes allocation. */
         return array_reduce_ex_regular(self, protocol);
     }
-    else if (protocol == 5) {
-        return array_reduce_ex_picklebuffer(self, protocol);
-    }
     else {
-        PyErr_Format(PyExc_ValueError,
-                     "__reduce_ex__ called with protocol > 5");
-        return NULL;
+        return array_reduce_ex_picklebuffer(self, protocol);
     }
 }
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 9e8022abd..4c316052d 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -34,6 +34,7 @@
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
 /* Internal APIs */
+#include "alloc.h"
 #include "arrayfunction_override.h"
 #include "arraytypes.h"
 #include "arrayobject.h"
@@ -4161,6 +4162,8 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS, NULL},
     {"_add_newdoc_ufunc", (PyCFunction)add_newdoc_ufunc,
         METH_VARARGS, NULL},
+    {"_set_madvise_hugepage", (PyCFunction)_set_madvise_hugepage,
+        METH_O, NULL},
     {NULL, NULL, 0, NULL}                /* sentinel */
 };
 
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index e6789e1d6..74bf01643 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -10,7 +10,11 @@
 #ifndef _NPY_UMATH_FAST_LOOP_MACROS_H_
 #define _NPY_UMATH_FAST_LOOP_MACROS_H_
 
-#include "simd.inc"
+static NPY_INLINE npy_uintp
+abs_ptrdiff(char *a, char *b)
+{
+    return (a > b) ? (a - b) : (b - a);  /* larger minus smaller address */
+}
 
 /**
  * Simple unoptimized loop macros that iterate over the ufunc arguments in
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index eea82309c..a5c663a47 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -38,6 +38,9 @@
 #define NPY_MAX_SIMD_SIZE 1024
 #endif
 
+/** Provides the various *_LOOP macros */
+#include "fast_loop_macros.h"
+
 /*
  * include vectorized functions and dispatchers
  * this file is safe to include also for generic builds
@@ -46,10 +49,6 @@
  */
 #include "simd.inc"
 
-/** Provides the various *_LOOP macros */
-#include "fast_loop_macros.h"
-
-
 /******************************************************************************
  **                          GENERIC FLOAT LOOPS                             **
  *****************************************************************************/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 106c7e7c9..6b0bcc3dc 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -50,12 +50,6 @@
  */
 #define MAX_STEP_SIZE 2097152
 
-static NPY_INLINE npy_uintp
-abs_ptrdiff(char *a, char *b)
-{
-    return (a > b) ? (a - b) : (b - a);
-}
-
 /*
  * nomemoverlap - returns true if two strided arrays have an overlapping
  * region in memory. ip_size/op_size = size of the arrays which can be negative
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 5a801f61c..2600d409a 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -289,6 +289,34 @@ def test_array_astype_warning(t):
     a = np.array(10, dtype=np.complex_)
     assert_warns(np.ComplexWarning, a.astype, t)
 
+@pytest.mark.parametrize(["dtype", "out_dtype"],
+        [(np.bytes_, np.bool_),
+         (np.unicode, np.bool_),
+         (np.dtype("S10,S9"), np.dtype("?,?"))])
+def test_string_to_boolean_cast(dtype, out_dtype):
+    """
+    Currently, for `astype` strings are cast to booleans effectively by
+    calling `bool(int(string))`. This is not consistent (see gh-9875) and
+    will eventually be deprecated.
+    """
+    arr = np.array(["10", "10\0\0\0", "0\0\0", "0"], dtype=dtype)
+    expected = np.array([True, True, False, False], dtype=out_dtype)
+    assert_array_equal(arr.astype(out_dtype), expected)
+
+@pytest.mark.parametrize(["dtype", "out_dtype"],
+        [(np.bytes_, np.bool_),
+         (np.unicode, np.bool_),
+         (np.dtype("S10,S9"), np.dtype("?,?"))])
+def test_string_to_boolean_cast_errors(dtype, out_dtype):
+    """
+    These currently raise ``ValueError`` because the intermediate cast to
+    an integer fails, but should not error out in the future.
+    """
+    for invalid in ["False", "True", "", "\0", "non-empty"]:
+        arr = np.array([invalid], dtype=dtype)
+        with assert_raises(ValueError):
+            arr.astype(out_dtype)
+
 def test_copyto_fromscalar():
     a = np.arange(6, dtype='f4').reshape(2, 3)
 
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index bcc6a0c4e..acd442e2f 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -1446,6 +1446,36 @@ class TestArrayComparisons:
         assert_(res)
         assert_(type(res) is bool)
 
+    def test_array_equal_equal_nan(self):
+        """Check `array_equal` with and without the `equal_nan` keyword."""
+        a1 = np.array([1, 2, np.nan])
+        a2 = np.array([1, np.nan, 2])
+        a3 = np.array([1, 2, np.inf])
+
+        # equal_nan=False by default
+        assert_(not np.array_equal(a1, a1))
+        assert_(np.array_equal(a1, a1, equal_nan=True))
+        assert_(not np.array_equal(a1, a2, equal_nan=True))
+        # nan's not conflated with inf's
+        assert_(not np.array_equal(a1, a3, equal_nan=True))
+        # 0-D arrays
+        a = np.array(np.nan)
+        assert_(not np.array_equal(a, a))
+        assert_(np.array_equal(a, a, equal_nan=True))
+        # Non-float dtype - equal_nan should have no effect
+        a = np.array([1, 2, 3], dtype=int)
+        assert_(np.array_equal(a, a))
+        assert_(np.array_equal(a, a, equal_nan=True))
+        # Multi-dimensional array
+        a = np.array([[0, 1], [np.nan, 1]])
+        assert_(not np.array_equal(a, a))
+        assert_(np.array_equal(a, a, equal_nan=True))
+        # Complex values: build two distinct arrays (``[x]*2`` would alias)
+        a, b = [np.array([1 + 1j]) for _ in range(2)]
+        a.real, b.imag = np.nan, np.nan
+        assert_(not np.array_equal(a, b, equal_nan=False))
+        assert_(np.array_equal(a, b, equal_nan=True))
+
     def test_none_compares_elementwise(self):
         a = np.array([None, 1, None], dtype=object)
         assert_equal(a == None, [True, False, True])
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 60c9fe437..10c652ad4 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -21,6 +21,27 @@ def on_powerpc():
            platform.machine().startswith('ppc')
 
 
+def bad_arcsinh():
+    """Return True when arcsinh of a small complex256 value is inaccurate.
+    The trig functions are not accurate on aarch64 for complex256. Rather
+    than dig through the actual problem, skip the test. This should be fixed
+    when we can move past glibc 2.17, the version in manylinux2014.
+    """
+    x = 1.78e-10
+    v1 = np.arcsinh(np.float128(x))
+    v2 = np.arcsinh(np.complex256(x)).real
+    # The eps for float128 is 1e-33, so this tolerance is way bigger
+    return abs((v1 / v2) - 1.0) > 1e-23
+
+if platform.machine() == 'aarch64' and bad_arcsinh():
+    skip_longcomplex_msg = ('Trig functions of np.longcomplex values known to be '
+                            'inaccurate on aarch64 for some compilation '
+                            'configurations, should be fixed by building on a '
+                            'platform using glibc>2.17')
+else:
+    skip_longcomplex_msg = ''
+
+
 class _FilterInvalids:
     def setup(self):
         self.olderr = np.seterr(invalid='ignore')
@@ -2823,6 +2844,8 @@ class TestComplexFunctions:
             # are accurate down to a few epsilons. (Eg. on Linux 64-bit)
             # So, give more leeway for long complex tests here:
             # Can use 2.1 for > Ubuntu LTS Trusty (2014), glibc = 2.19.
+            if skip_longcomplex_msg:
+                pytest.skip(skip_longcomplex_msg)
             check(x_series, 50.0*eps)
         else:
             check(x_series, 2.1*eps)
diff --git a/numpy/distutils/fcompiler/intel.py b/numpy/distutils/fcompiler/intel.py
index d84f38c76..c7b3c2340 100644
--- a/numpy/distutils/fcompiler/intel.py
+++ b/numpy/distutils/fcompiler/intel.py
@@ -59,7 +59,7 @@ class IntelFCompiler(BaseIntelFCompiler):
     def get_flags_opt(self):  # Scipy test failures with -O2
         v = self.get_version()
         mpopt = 'openmp' if v and v < '15' else 'qopenmp'
-        return ['-fp-model strict -O1 -{}'.format(mpopt)]
+        return ['-fp-model', 'strict', '-O1', '-{}'.format(mpopt)]
 
     def get_flags_arch(self):
         return []
@@ -125,10 +125,10 @@ class IntelEM64TFCompiler(IntelFCompiler):
     def get_flags_opt(self):  # Scipy test failures with -O2
         v = self.get_version()
         mpopt = 'openmp' if v and v < '15' else 'qopenmp'
-        return ['-fp-model strict -O1 -{}'.format(mpopt)]
+        return ['-fp-model', 'strict', '-O1', '-{}'.format(mpopt)]
 
     def get_flags_arch(self):
-        return ['']
+        return []
 
 # Is there no difference in the version string between the above compilers
 # and the Visual compilers?
@@ -210,7 +210,7 @@ class IntelEM64VisualFCompiler(IntelVisualFCompiler):
     version_match = simple_version_match(start=r'Intel\(R\).*?64,')
 
     def get_flags_arch(self):
-        return ['']
+        return []
 
 
 if __name__ == '__main__':
diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py
index 69d35f5c2..88cd1a160 100644
--- a/numpy/distutils/setup.py
+++ b/numpy/distutils/setup.py
@@ -4,7 +4,7 @@ def configuration(parent_package='',top_path=None):
     config = Configuration('distutils', parent_package, top_path)
     config.add_subpackage('command')
     config.add_subpackage('fcompiler')
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     config.add_data_files('site.cfg')
     config.add_data_files('mingw/gfortran_vs2003_hack.c')
     config.make_config_py()
diff --git a/numpy/f2py/setup.py b/numpy/f2py/setup.py
index 6314c5af3..80b47e527 100644
--- a/numpy/f2py/setup.py
+++ b/numpy/f2py/setup.py
@@ -25,7 +25,8 @@ from __version__ import version
 
 def configuration(parent_package='', top_path=None):
     config = Configuration('f2py', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
+    config.add_data_dir('tests/src')
     config.add_data_files(
         'src/fortranobject.c',
         'src/fortranobject.h')
diff --git a/numpy/fft/setup.py b/numpy/fft/setup.py
index e8204fcd3..9ed824e4f 100644
--- a/numpy/fft/setup.py
+++ b/numpy/fft/setup.py
@@ -4,7 +4,7 @@ def configuration(parent_package='',top_path=None):
     from numpy.distutils.misc_util import Configuration
     config = Configuration('fft', parent_package, top_path)
 
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
 
     # AIX needs to be told to use large file support - at all times
     defs = [('_LARGE_FILES', None)] if sys.platform[:3] == "aix" else []
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index ff5b94342..84aff5e5d 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -165,7 +165,6 @@ class LineSplitter:
 
         """
         return lambda input: [_.strip() for _ in method(input)]
-    #
 
     def __init__(self, delimiter=None, comments='#', autostrip=True,
                  encoding=None):
@@ -195,7 +194,6 @@ class LineSplitter:
         else:
             self._handyman = _handyman
         self.encoding = encoding
-    #
 
     def _delimited_splitter(self, line):
         """Chop off comments, strip, and split at delimiter. """
@@ -205,7 +203,6 @@ class LineSplitter:
         if not line:
             return []
         return line.split(self.delimiter)
-    #
 
     def _fixedwidth_splitter(self, line):
         if self.comments is not None:
@@ -216,7 +213,6 @@ class LineSplitter:
         fixed = self.delimiter
         slices = [slice(i, i + fixed) for i in range(0, len(line), fixed)]
         return [line[s] for s in slices]
-    #
 
     def _variablewidth_splitter(self, line):
         if self.comments is not None:
@@ -225,7 +221,6 @@ class LineSplitter:
             return []
         slices = self.delimiter
         return [line[s] for s in slices]
-    #
 
     def __call__(self, line):
         return self._handyman(_decode_line(line, self.encoding))
@@ -282,10 +277,9 @@ class NameValidator:
     ('EXCL', 'FIELD2', 'NO_Q', 'WITH_SPACE', 'CASE')
 
     """
-    #
+
     defaultexcludelist = ['return', 'file', 'print']
     defaultdeletechars = set(r"""~!@#$%^&*()-=+~\|]}[{';: /?.>,<""")
-    #
 
     def __init__(self, excludelist=None, deletechars=None,
                  case_sensitive=None, replace_space='_'):
@@ -311,7 +305,7 @@ class NameValidator:
         else:
             msg = 'unrecognized case_sensitive value %s.' % case_sensitive
             raise ValueError(msg)
-        #
+
         self.replace_space = replace_space
 
     def validate(self, names, defaultfmt="f%i", nbfields=None):
@@ -362,7 +356,7 @@ class NameValidator:
         validatednames = []
         seen = dict()
         nbempty = 0
-        #
+
         for item in names:
             item = case_converter(item).strip()
             if replace_space:
@@ -383,7 +377,6 @@ class NameValidator:
                 validatednames.append(item)
             seen[item] = cnt + 1
         return tuple(validatednames)
-    #
 
     def __call__(self, names, defaultfmt="f%i", nbfields=None):
         return self.validate(names, defaultfmt=defaultfmt, nbfields=nbfields)
@@ -502,7 +495,6 @@ class StringConverter:
         upgrade or not. Default is False.
 
     """
-    #
     _mapper = [(nx.bool_, str2bool, False),
                (nx.int_, int, -1),]
 
@@ -522,55 +514,51 @@ class StringConverter:
                     (nx.floating, float, nx.nan),
                     (nx.complexfloating, complex, nx.nan + 0j),])
 
-    (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
-
     @classmethod
     def _getdtype(cls, val):
         """Returns the dtype of the input variable."""
         return np.array(val).dtype
-    #
 
     @classmethod
     def _getsubdtype(cls, val):
         """Returns the type of the dtype of the input variable."""
         return np.array(val).dtype.type
-    #
-    # This is a bit annoying. We want to return the "general" type in most
-    # cases (ie. "string" rather than "S10"), but we want to return the
-    # specific type for datetime64 (ie. "datetime64[us]" rather than
-    # "datetime64").
 
     @classmethod
     def _dtypeortype(cls, dtype):
         """Returns dtype for datetime64 and type of dtype otherwise."""
+
+        # This is a bit annoying. We want to return the "general" type in most
+        # cases (ie. "string" rather than "S10"), but we want to return the
+        # specific type for datetime64 (ie. "datetime64[us]" rather than
+        # "datetime64").
         if dtype.type == np.datetime64:
             return dtype
         return dtype.type
-    #
 
     @classmethod
     def upgrade_mapper(cls, func, default=None):
         """
-    Upgrade the mapper of a StringConverter by adding a new function and
-    its corresponding default.
-
-    The input function (or sequence of functions) and its associated
-    default value (if any) is inserted in penultimate position of the
-    mapper.  The corresponding type is estimated from the dtype of the
-    default value.
+        Upgrade the mapper of a StringConverter by adding a new function and
+        its corresponding default.
 
-    Parameters
-    ----------
-    func : var
-        Function, or sequence of functions
+        The input function (or sequence of functions) and its associated
+        default value (if any) is inserted in penultimate position of the
+        mapper.  The corresponding type is estimated from the dtype of the
+        default value.
 
-    Examples
-    --------
-    >>> import dateutil.parser
-    >>> import datetime
-    >>> dateparser = dateutil.parser.parse
-    >>> defaultdate = datetime.date(2000, 1, 1)
-    >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
+        Parameters
+        ----------
+        func : var
+            Function, or sequence of functions
+
+        Examples
+        --------
+        >>> import dateutil.parser
+        >>> import datetime
+        >>> dateparser = dateutil.parser.parse
+        >>> defaultdate = datetime.date(2000, 1, 1)
+        >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
         """
         # Func is a single functions
         if hasattr(func, '__call__'):
@@ -586,9 +574,22 @@ class StringConverter:
             else:
                 default = list(default)
                 default.append([None] * (len(func) - len(default)))
-            for (fct, dft) in zip(func, default):
+            for fct, dft in zip(func, default):
                 cls._mapper.insert(-1, (cls._getsubdtype(dft), fct, dft))
-    #
+
+    @classmethod
+    def _find_map_entry(cls, dtype):
+        """Return (index, entry) of the best _mapper match; exact type wins."""
+        for i, (deftype, func, default_def) in enumerate(cls._mapper):
+            if dtype.type == deftype:
+                return i, (deftype, func, default_def)
+
+        # otherwise take the first entry that dtype.type is a subtype of
+        for i, (deftype, func, default_def) in enumerate(cls._mapper):
+            if np.issubdtype(dtype.type, deftype):
+                return i, (deftype, func, default_def)
+
+        raise LookupError  # nothing in the mapper fits this dtype
 
     def __init__(self, dtype_or_func=None, default=None, missing_values=None,
                  locked=False):
@@ -621,36 +622,26 @@ class StringConverter:
                     except ValueError:
                         default = None
                 dtype = self._getdtype(default)
-            # Set the status according to the dtype
-            _status = -1
-            for (i, (deftype, func, default_def)) in enumerate(self._mapper):
-                if np.issubdtype(dtype.type, deftype):
-                    _status = i
-                    if default is None:
-                        self.default = default_def
-                    else:
-                        self.default = default
-                    break
-            # if a converter for the specific dtype is available use that
-            last_func = func
-            for (i, (deftype, func, default_def)) in enumerate(self._mapper):
-                if dtype.type == deftype:
-                    _status = i
-                    last_func = func
-                    if default is None:
-                        self.default = default_def
-                    else:
-                        self.default = default
-                    break
-            func = last_func
-            if _status == -1:
-                # We never found a match in the _mapper...
-                _status = 0
+
+            # find the best match in our mapper
+            try:
+                self._status, (_, func, default_def) = self._find_map_entry(dtype)
+            except LookupError:
+                # no match
                 self.default = default
-            self._status = _status
+                _, func, _ = self._mapper[-1]
+                self._status = 0
+            else:
+                # use the found default only if we did not already have one
+                if default is None:
+                    self.default = default_def
+                else:
+                    self.default = default
+
             # If the input was a dtype, set the function to the last we saw
             if self.func is None:
                 self.func = func
+
             # If the status is 1 (int), change the function to
             # something more robust.
             if self.func == self._mapper[1][1]:
@@ -667,19 +658,17 @@ class StringConverter:
             if isinstance(missing_values, str):
                 missing_values = missing_values.split(",")
             self.missing_values = set(list(missing_values) + [''])
-        #
+
         self._callingfunction = self._strict_call
         self.type = self._dtypeortype(dtype)
         self._checked = False
         self._initial_default = default
-    #
 
     def _loose_call(self, value):
         try:
             return self.func(value)
         except ValueError:
             return self.default
-    #
 
     def _strict_call(self, value):
         try:
@@ -705,11 +694,9 @@ class StringConverter:
                     self._checked = False
                 return self.default
             raise ValueError("Cannot convert string '%s'" % value)
-    #
 
     def __call__(self, value):
         return self._callingfunction(value)
-    #
 
     def _do_upgrade(self):
         # Raise an exception if we locked the converter...
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 7eeed7825..dea01d12d 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -764,6 +764,30 @@ def copy(a, order='K', subok=False):
     >>> x[0] == z[0]
     False
 
+    Note that np.copy is a shallow copy and will not copy object
+    elements within arrays. This is mainly important for arrays
+    containing Python objects. The new array will contain the
+    same object which may lead to surprises if that object can
+    be modified (is mutable):
+
+    >>> a = np.array([1, 'm', [2, 3, 4]], dtype=object)
+    >>> b = np.copy(a)
+    >>> b[2][0] = 10
+    >>> a
+    array([1, 'm', list([10, 3, 4])], dtype=object)
+
+    To ensure all elements within an ``object`` array are copied,
+    use `copy.deepcopy`:
+
+    >>> import copy
+    >>> a = np.array([1, 'm', [2, 3, 4]], dtype=object)
+    >>> c = copy.deepcopy(a)
+    >>> c[2][0] = 10
+    >>> c
+    array([1, 'm', list([10, 3, 4])], dtype=object)
+    >>> a
+    array([1, 'm', list([2, 3, 4])], dtype=object)
+
     """
     return array(a, order=order, subok=subok, copy=True)
 
@@ -2026,7 +2050,7 @@ class vectorize:
         self.pyfunc = pyfunc
         self.cache = cache
         self.signature = signature
-        self._ufunc = None    # Caching to improve default performance
+        self._ufunc = {}    # Caching to improve default performance
 
         if doc is None:
             self.__doc__ = pyfunc.__doc__
@@ -2091,14 +2115,22 @@ class vectorize:
 
         if self.otypes is not None:
             otypes = self.otypes
-            nout = len(otypes)
 
-            # Note logic here: We only *use* self._ufunc if func is self.pyfunc
-            # even though we set self._ufunc regardless.
-            if func is self.pyfunc and self._ufunc is not None:
-                ufunc = self._ufunc
+            # self._ufunc is a dictionary whose keys are the number of
+            # arguments (i.e. len(args)) and whose values are ufuncs created
+            # by frompyfunc. len(args) can be different for different calls if
+            # self.pyfunc has parameters with default values.  We only use the
+            # cache when func is self.pyfunc, which occurs when the call uses
+            # only positional arguments and no arguments are excluded.
+
+            nin = len(args)
+            nout = len(self.otypes)
+            if func is not self.pyfunc or nin not in self._ufunc:
+                ufunc = frompyfunc(func, nin, nout)
             else:
-                ufunc = self._ufunc = frompyfunc(func, len(args), nout)
+                ufunc = None  # We'll get it from self._ufunc
+            if func is self.pyfunc:
+                ufunc = self._ufunc.setdefault(nin, ufunc)
         else:
             # Get number of outputs and output types by calling the function on
             # the first entries of args.  We also cache the result to prevent
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index f080cc392..1a9b41ced 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -1047,7 +1047,15 @@ def histogramdd(sample, bins=10, range=None, normed=None, weights=None,
                 raise ValueError(
                     '`bins[{}]` must be positive, when an integer'.format(i))
             smin, smax = _get_outer_edges(sample[:,i], range[i])
-            edges[i] = np.linspace(smin, smax, bins[i] + 1)
+            try:
+                n = operator.index(bins[i])
+            
+            except TypeError as e:
+                raise TypeError(
+                	"`bins[{}]` must be an integer, when a scalar".format(i)
+                ) from e
+                
+            edges[i] = np.linspace(smin, smax, n + 1)    
         elif np.ndim(bins[i]) == 1:
             edges[i] = np.asarray(bins[i])
             if np.any(edges[i][:-1] > edges[i][1:]):
diff --git a/numpy/lib/setup.py b/numpy/lib/setup.py
index 5d0341d86..b3f441f38 100644
--- a/numpy/lib/setup.py
+++ b/numpy/lib/setup.py
@@ -2,7 +2,8 @@ def configuration(parent_package='',top_path=None):
     from numpy.distutils.misc_util import Configuration
 
     config = Configuration('lib', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
+    config.add_data_dir('tests/data')
     return config
 
 if __name__ == '__main__':
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 23bf3296d..b4e928273 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -3,6 +3,7 @@ import warnings
 import sys
 import decimal
 from fractions import Fraction
+import math
 import pytest
 
 import numpy as np
@@ -1221,6 +1222,16 @@ class TestExtins:
         assert_array_equal(a, ac)
 
 
+# _foo1 and _foo2 are used in some tests in TestVectorize.
+
+def _foo1(x, y=1.0):
+    return y*math.floor(x)
+
+
+def _foo2(x, y=1.0, z=0.0):
+    return y*math.floor(x) + z
+
+
 class TestVectorize:
 
     def test_simple(self):
@@ -1252,7 +1263,6 @@ class TestVectorize:
         assert_array_equal(y, x)
 
     def test_ufunc(self):
-        import math
         f = vectorize(math.cos)
         args = np.array([0, 0.5 * np.pi, np.pi, 1.5 * np.pi, 2 * np.pi])
         r1 = f(args)
@@ -1273,6 +1283,63 @@ class TestVectorize:
         r2 = np.array([3, 4, 5])
         assert_array_equal(r1, r2)
 
+    def test_keywords_with_otypes_order1(self):
+        # gh-1620: The second call of f would crash with
+        # `ValueError: invalid number of arguments`.
+        f = vectorize(_foo1, otypes=[float])
+        # We're testing the caching of ufuncs by vectorize, so the order
+        # of these function calls is an important part of the test.
+        r1 = f(np.arange(3.0), 1.0)
+        r2 = f(np.arange(3.0))
+        assert_array_equal(r1, r2)
+
+    def test_keywords_with_otypes_order2(self):
+        # gh-1620: The second call of f would crash with
+        # `ValueError: non-broadcastable output operand with shape ()
+        # doesn't match the broadcast shape (3,)`.
+        f = vectorize(_foo1, otypes=[float])
+        # We're testing the caching of ufuncs by vectorize, so the order
+        # of these function calls is an important part of the test.
+        r1 = f(np.arange(3.0))
+        r2 = f(np.arange(3.0), 1.0)
+        assert_array_equal(r1, r2)
+
+    def test_keywords_with_otypes_order3(self):
+        # gh-1620: The third call of f would crash with
+        # `ValueError: invalid number of arguments`.
+        f = vectorize(_foo1, otypes=[float])
+        # We're testing the caching of ufuncs by vectorize, so the order
+        # of these function calls is an important part of the test.
+        r1 = f(np.arange(3.0))
+        r2 = f(np.arange(3.0), y=1.0)
+        r3 = f(np.arange(3.0))
+        assert_array_equal(r1, r2)
+        assert_array_equal(r1, r3)
+
+    def test_keywords_with_otypes_several_kwd_args1(self):
+        # gh-1620 Make sure different uses of keyword arguments
+        # don't break the vectorized function.
+        f = vectorize(_foo2, otypes=[float])
+        # We're testing the caching of ufuncs by vectorize, so the order
+        # of these function calls is an important part of the test.
+        r1 = f(10.4, z=100)
+        r2 = f(10.4, y=-1)
+        r3 = f(10.4)
+        assert_equal(r1, _foo2(10.4, z=100))
+        assert_equal(r2, _foo2(10.4, y=-1))
+        assert_equal(r3, _foo2(10.4))
+
+    def test_keywords_with_otypes_several_kwd_args2(self):
+        # gh-1620 Make sure different uses of keyword arguments
+        # don't break the vectorized function.
+        f = vectorize(_foo2, otypes=[float])
+        # We're testing the caching of ufuncs by vectorize, so the order
+        # of these function calls is an important part of the test.
+        r1 = f(z=100, x=10.4, y=-1)
+        r2 = f(1, 2, 3)
+        assert_equal(r1, _foo2(z=100, x=10.4, y=-1))
+        assert_equal(r2, _foo2(1, 2, 3))
+
     def test_keywords_no_func_code(self):
         # This needs to test a function that has keywords but
         # no func_code attribute, since otherwise vectorize will
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 6d3afdd49..5ee326f3c 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -2613,12 +2613,13 @@ def norm(x, ord=None, axis=None, keepdims=False):
 
 # multi_dot
 
-def _multidot_dispatcher(arrays):
-    return arrays
+def _multidot_dispatcher(arrays, *, out=None):
+    yield from arrays
+    yield out
 
 
 @array_function_dispatch(_multidot_dispatcher)
-def multi_dot(arrays):
+def multi_dot(arrays, *, out=None):
     """
     Compute the dot product of two or more arrays in a single function call,
     while automatically selecting the fastest evaluation order.
@@ -2642,6 +2643,15 @@ def multi_dot(arrays):
         If the first argument is 1-D it is treated as row vector.
         If the last argument is 1-D it is treated as column vector.
         The other arguments must be 2-D.
+    out : ndarray, optional
+        Output argument. This must have the exact kind that would be returned
+        if it was not used. In particular, it must have the right type, must be
+        C-contiguous, and its dtype must be the dtype that would be returned
+        for `dot(a, b)`. This is a performance feature. Therefore, if these
+        conditions are not met, an exception is raised, instead of attempting
+        to be flexible.
+
+        .. versionadded:: 1.19.0
 
     Returns
     -------
@@ -2699,7 +2709,7 @@ def multi_dot(arrays):
     if n < 2:
         raise ValueError("Expecting at least two arrays.")
     elif n == 2:
-        return dot(arrays[0], arrays[1])
+        return dot(arrays[0], arrays[1], out=out)
 
     arrays = [asanyarray(a) for a in arrays]
 
@@ -2715,10 +2725,10 @@ def multi_dot(arrays):
 
     # _multi_dot_three is much faster than _multi_dot_matrix_chain_order
     if n == 3:
-        result = _multi_dot_three(arrays[0], arrays[1], arrays[2])
+        result = _multi_dot_three(arrays[0], arrays[1], arrays[2], out=out)
     else:
         order = _multi_dot_matrix_chain_order(arrays)
-        result = _multi_dot(arrays, order, 0, n - 1)
+        result = _multi_dot(arrays, order, 0, n - 1, out=out)
 
     # return proper shape
     if ndim_first == 1 and ndim_last == 1:
@@ -2729,7 +2739,7 @@ def multi_dot(arrays):
         return result
 
 
-def _multi_dot_three(A, B, C):
+def _multi_dot_three(A, B, C, out=None):
     """
     Find the best order for three arrays and do the multiplication.
 
@@ -2745,9 +2755,9 @@ def _multi_dot_three(A, B, C):
     cost2 = a1b0 * c1 * (a0 + b1c0)
 
     if cost1 < cost2:
-        return dot(dot(A, B), C)
+        return dot(dot(A, B), C, out=out)
     else:
-        return dot(A, dot(B, C))
+        return dot(A, dot(B, C), out=out)
 
 
 def _multi_dot_matrix_chain_order(arrays, return_costs=False):
@@ -2791,10 +2801,14 @@ def _multi_dot_matrix_chain_order(arrays, return_costs=False):
     return (s, m) if return_costs else s
 
 
-def _multi_dot(arrays, order, i, j):
+def _multi_dot(arrays, order, i, j, out=None):
     """Actually do the multiplication with the given order."""
     if i == j:
+        # the initial call with non-None out should never get here
+        assert out is None
+
         return arrays[i]
     else:
         return dot(_multi_dot(arrays, order, i, order[i, j]),
-                   _multi_dot(arrays, order, order[i, j] + 1, j))
+                   _multi_dot(arrays, order, order[i, j] + 1, j),
+                   out=out)
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index acfab0a68..57fdd502b 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -6,7 +6,7 @@ def configuration(parent_package='', top_path=None):
     from numpy.distutils.system_info import get_info, system_info
     config = Configuration('linalg', parent_package, top_path)
 
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
 
     # Configure lapack_lite
 
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index dae4ef61e..3f3bf9f70 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1930,6 +1930,41 @@ class TestMultiDot:
         # the result should be a scalar
         assert_equal(multi_dot([A1d, B, C, D1d]).shape, ())
 
+    def test_three_arguments_and_out(self):
+        # multi_dot with three arguments uses a fast hand coded algorithm to
+        # determine the optimal order. Therefore test it separately.
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+
+        out = np.zeros((6, 2))
+        ret = multi_dot([A, B, C], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C))
+        assert_almost_equal(out, np.dot(A, np.dot(B, C)))
+
+    def test_two_arguments_and_out(self):
+        # separate code path with two arguments
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        out = np.zeros((6, 6))
+        ret = multi_dot([A, B], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B))
+        assert_almost_equal(out, np.dot(A, B))
+
+    def test_dynamic_programing_optimization_and_out(self):
+        # multi_dot with four or more arguments uses the dynamic programming
+        # optimization and therefore deserves a separate test
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D = np.random.random((2, 1))
+        out = np.zeros((6, 1))
+        ret = multi_dot([A, B, C, D], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C).dot(D))
+
     def test_dynamic_programming_logic(self):
         # Test for the dynamic programming part
         # This test is directly taken from Cormen page 376.
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index a5e59bb74..a7214f9bf 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -285,8 +285,10 @@ def _extremum_fill_value(obj, extremum, extremum_name):
     def _scalar_fill_value(dtype):
         try:
             return extremum[dtype]
         except KeyError:
-            raise TypeError(f"Unsuitable type {dtype} for calculating {extremum_name}.")
+            raise TypeError(
+                f"Unsuitable type {dtype} for calculating {extremum_name}."
+            ) from None
 
     dtype = _get_dtype_of(obj)
     return _recursive_fill_value(dtype, _scalar_fill_value)
diff --git a/numpy/ma/setup.py b/numpy/ma/setup.py
index 144a961c2..d3f34c874 100644
--- a/numpy/ma/setup.py
+++ b/numpy/ma/setup.py
@@ -2,7 +2,7 @@
 def configuration(parent_package='',top_path=None):
     from numpy.distutils.misc_util import Configuration
     config = Configuration('ma', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     return config
 
 if __name__ == "__main__":
diff --git a/numpy/matrixlib/setup.py b/numpy/matrixlib/setup.py
index 529d2a2eb..19b3bb2de 100644
--- a/numpy/matrixlib/setup.py
+++ b/numpy/matrixlib/setup.py
@@ -2,7 +2,7 @@
 def configuration(parent_package='', top_path=None):
     from numpy.distutils.misc_util import Configuration
     config = Configuration('matrixlib', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     return config
 
 if __name__ == "__main__":
diff --git a/numpy/polynomial/setup.py b/numpy/polynomial/setup.py
index 8fc82cba1..641464518 100644
--- a/numpy/polynomial/setup.py
+++ b/numpy/polynomial/setup.py
@@ -1,7 +1,7 @@
 def configuration(parent_package='',top_path=None):
     from numpy.distutils.misc_util import Configuration
     config = Configuration('polynomial', parent_package, top_path)
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     return config
 
 if __name__ == '__main__':
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index b976d51c6..27cb2859e 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -4007,10 +4007,12 @@ cdef class Generator:
         # return val
 
         cdef np.npy_intp k, totsize, i, j
-        cdef np.ndarray alpha_arr, val_arr
+        cdef np.ndarray alpha_arr, val_arr, alpha_csum_arr
+        cdef double csum
         cdef double *alpha_data
+        cdef double *alpha_csum_data
         cdef double *val_data
-        cdef double acc, invacc
+        cdef double acc, invacc, v
 
         k = len(alpha)
         alpha_arr = <np.ndarray>np.PyArray_FROMANY(
@@ -4034,17 +4036,74 @@ cdef class Generator:
 
         i = 0
         totsize = np.PyArray_SIZE(val_arr)
-        with self.lock, nogil:
-            while i < totsize:
-                acc = 0.0
-                for j in range(k):
-                    val_data[i+j] = random_standard_gamma(&self._bitgen,
-                                                              alpha_data[j])
-                    acc = acc + val_data[i + j]
-                invacc = 1/acc
-                for j in range(k):
-                    val_data[i + j] = val_data[i + j] * invacc
-                i = i + k
+
+        # Select one of the following two algorithms for the generation
+        #  of Dirichlet random variates (RVs)
+        #
+        # A) Small alpha case: Use the stick-breaking approach with beta
+        #    random variates (RVs).
+        # B) Standard case: Perform unit normalisation of a vector
+        #    of gamma random variates
+        #
+        # A) prevents NaNs resulting from 0/0 that may occur in B).
+        # When all values in the vector ':math:\\alpha' are smaller
+        # than 1, there is a nonzero probability that all
+        # generated gamma RVs will be 0. When that happens, the
+        # normalization process ends up computing 0/0, giving nan. A)
+        # does not use divisions, so that a situation in which 0/0 has
+        # to be computed cannot occur. A) is slower than B) as
+        # generation of beta RVs is slower than generation of gamma
+        # RVs. A) is selected whenever `alpha.max() < t`, where `t <
+        # 1` is a threshold that controls the probability of
+        # generating a NaN value when B) is used. For a given
+        # threshold `t` this probability can be bounded by
+        # `gammainc(t, d)` where `gammainc` is the regularized
+        # incomplete gamma function and `d` is the smallest positive
+        # floating point number that can be represented with a given
+        # precision. For the chosen threshold `t=0.1` this probability
+        # is smaller than `1.8e-31` for double precision floating
+        # point numbers.
+
+        if (k > 0) and (alpha_arr.max() < 0.1):
+            # Small alpha case: Use stick-breaking approach with beta
+            # random variates (RVs).
+            # alpha_csum_data will hold the cumulative sum, right to
+            # left, of alpha_arr.
+            # Use a numpy array for memory management only.  We could just as
+            # well have malloc'd alpha_csum_data.  alpha_arr is a C-contiguous
+            # double array, therefore so is alpha_csum_arr.
+            alpha_csum_arr = np.empty_like(alpha_arr)
+            alpha_csum_data = <double*>np.PyArray_DATA(alpha_csum_arr)
+            csum = 0.0
+            for j in range(k - 1, -1, -1):
+                csum += alpha_data[j]
+                alpha_csum_data[j] = csum
+
+            with self.lock, nogil:
+                while i < totsize:
+                    acc = 1.
+                    for j in range(k - 1):
+                        v = random_beta(&self._bitgen, alpha_data[j],
+                                        alpha_csum_data[j + 1])
+                        val_data[i + j] = acc * v
+                        acc *= (1. - v)
+                    val_data[i + k - 1] = acc
+                    i = i + k
+
+        else:
+            # Standard case: Unit normalisation of a vector of gamma random
+            # variates
+            with self.lock, nogil:
+                while i < totsize:
+                    acc = 0.
+                    for j in range(k):
+                        val_data[i + j] = random_standard_gamma(&self._bitgen,
+                                                                alpha_data[j])
+                        acc = acc + val_data[i + j]
+                    invacc = 1. / acc
+                    for j in range(k):
+                        val_data[i + j] = val_data[i + j] * invacc
+                    i = i + k
 
         return diric
 
diff --git a/numpy/random/_pcg64.pyx b/numpy/random/_pcg64.pyx
index 05d27db5c..605aae4bc 100644
--- a/numpy/random/_pcg64.pyx
+++ b/numpy/random/_pcg64.pyx
@@ -38,7 +38,7 @@ cdef double pcg64_double(void* st) nogil:
 
 cdef class PCG64(BitGenerator):
     """
-    PCG64(seed_seq=None)
+    PCG64(seed=None)
 
     BitGenerator for the PCG-64 pseudo-random number generator.
 
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index 90ec42671..88ddb1268 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -31,7 +31,8 @@ def configuration(parent_package='', top_path=None):
                 ('_LARGEFILE64_SOURCE', '1')]
 
     defs.append(('NPY_NO_DEPRECATED_API', 0))
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
+    config.add_data_dir('tests/data')
     config.add_data_dir('_examples')
 
     EXTRA_LINK_ARGS = []
diff --git a/numpy/random/tests/test_extending.py b/numpy/random/tests/test_extending.py
index f7efafba9..77353463e 100644
--- a/numpy/random/tests/test_extending.py
+++ b/numpy/random/tests/test_extending.py
@@ -46,14 +46,24 @@ def test_cython(tmp_path):
     srcdir = os.path.join(os.path.dirname(__file__), '..')
     shutil.copytree(srcdir, tmp_path / 'random')
     # build the examples and "install" them into a temporary directory
-    env = os.environ.copy()
+    build_dir = tmp_path / 'random' / '_examples' / 'cython'
     subprocess.check_call([sys.executable, 'setup.py', 'build', 'install',
                            '--prefix', str(tmp_path / 'installdir'),
                            '--single-version-externally-managed',
                            '--record', str(tmp_path/ 'tmp_install_log.txt'),
                           ],
-                          cwd=str(tmp_path / 'random' / '_examples' / 'cython'),
-                          env=env)
+                          cwd=str(build_dir),
+                      )
+    # gh-16162: make sure numpy's __init__.pxd was used for cython
+    # not really part of this test, but it is a convenient place to check
+    with open(build_dir / 'extending.c') as fid:
+        txt_to_find = 'NumPy API declarations from "numpy/__init__.pxd"'
+        for i, line in enumerate(fid):
+            if txt_to_find in line:
+                break
+        else:
+            assert False, ("Could not find '{}' in C file, "
+                           "wrong pxd used".format(txt_to_find))
     # get the path to the so's
     so1 = so2 = None
     with open(tmp_path /'tmp_install_log.txt') as fid:
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 08b44e4db..a28b7ca11 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -1103,6 +1103,31 @@ class TestRandomDist:
                                   size=(3, 2))
         assert_array_almost_equal(non_contig, contig)
 
+    def test_dirichlet_small_alpha(self):
+        eps = 1.0e-9  # 1.0e-10 -> runtime x 10; 1e-11 -> runtime x 200, etc.
+        alpha = eps * np.array([1., 1.0e-3])
+        random = Generator(MT19937(self.seed))
+        actual = random.dirichlet(alpha, size=(3, 2))
+        expected = np.array([
+            [[1., 0.],
+             [1., 0.]],
+            [[1., 0.],
+             [1., 0.]],
+            [[1., 0.],
+             [1., 0.]]
+        ])
+        assert_array_almost_equal(actual, expected, decimal=15)
+
+    @pytest.mark.slow
+    def test_dirichlet_moderately_small_alpha(self):
+        # Use alpha.max() < 0.1 to trigger stick breaking code path
+        alpha = np.array([0.02, 0.04, 0.03])
+        exact_mean = alpha / alpha.sum()
+        random = Generator(MT19937(self.seed))
+        sample = random.dirichlet(alpha, size=20000000)
+        sample_mean = sample.mean(axis=0)
+        assert_allclose(sample_mean, exact_mean, rtol=1e-3)
+
     def test_exponential(self):
         random = Generator(MT19937(self.seed))
         actual = random.exponential(1.1234, size=(3, 2))
diff --git a/numpy/setup.py b/numpy/setup.py
index fb9b36b78..52db6a68b 100644
--- a/numpy/setup.py
+++ b/numpy/setup.py
@@ -18,7 +18,7 @@ def configuration(parent_package='',top_path=None):
     config.add_subpackage('random')
     config.add_subpackage('testing')
     config.add_data_dir('doc')
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     config.make_config_py() # installs __config__.py
     return config
 
diff --git a/numpy/testing/setup.py b/numpy/testing/setup.py
index f4970991c..13191f13f 100755
--- a/numpy/testing/setup.py
+++ b/numpy/testing/setup.py
@@ -5,7 +5,7 @@ def configuration(parent_package='',top_path=None):
     config = Configuration('testing', parent_package, top_path)
 
     config.add_subpackage('_private')
-    config.add_data_dir('tests')
+    config.add_subpackage('tests')
     return config
 
 if __name__ == '__main__':
diff --git a/runtests.py b/runtests.py
index e470f8a9d..7f1d55b85 100755
--- a/runtests.py
+++ b/runtests.py
@@ -125,7 +125,7 @@ def main(argv):
                               "COMMIT. Note that you need to commit your "
                               "changes first!"))
     parser.add_argument("args", metavar="ARGS", default=[], nargs=REMAINDER,
-                        help="Arguments to pass to Nose, Python or shell")
+                        help="Arguments to pass to Nose, asv, Python or shell")
     args = parser.parse_args(argv)
 
     if args.durations < 0:
@@ -162,8 +162,10 @@ def main(argv):
         site_dir = os.path.sep.join(_temp.__file__.split(os.path.sep)[:-2])
 
     extra_argv = args.args[:]
-    if extra_argv and extra_argv[0] == '--':
-        extra_argv = extra_argv[1:]
+    if not args.bench:
+        # extra_argv may also list selected benchmarks
+        if extra_argv and extra_argv[0] == '--':
+            extra_argv = extra_argv[1:]
 
     if args.python:
         # Debugging issues with warnings is much easier if you can see them
@@ -220,13 +222,21 @@ def main(argv):
 
     if args.bench:
         # Run ASV
-        items = extra_argv
+        for i, v in enumerate(extra_argv):
+            if v.startswith("--"):
+                items = extra_argv[:i]
+                if v == "--":
+                    i += 1  # skip '--' indicating further are passed on.
+                bench_args = extra_argv[i:]
+                break
+        else:
+            items = extra_argv
+            bench_args = []
+
         if args.tests:
             items += args.tests
         if args.submodule:
             items += [args.submodule]
-
-        bench_args = []
         for a in items:
             bench_args.extend(['--bench', a])
 
diff --git a/setup.py b/setup.py
index a5aa1709f..594c3ed91 100755
--- a/setup.py
+++ b/setup.py
@@ -161,6 +161,7 @@ def configuration(parent_package='',top_path=None):
 
     config.add_subpackage('numpy')
     config.add_data_files(('numpy', 'LICENSE.txt'))
+    config.add_data_files(('numpy', 'numpy/__init__.pxd'))
 
     config.get_version('numpy/version.py') # sets config.version
 
diff --git a/shippable.yml b/shippable.yml
index 6985f38ef..dc3617e12 100644
--- a/shippable.yml
+++ b/shippable.yml
@@ -25,7 +25,6 @@ build:
     - sudo add-apt-repository ppa:ubuntu-toolchain-r/test
     - sudo apt-get update
     - sudo apt-get install gcc gfortran libgfortran5
-    - python -mpip install urllib3
     - target=$(python tools/openblas_support.py)
     - ls -lR "${target}"
     - sudo cp -r "${target}"/lib/* /usr/lib
diff --git a/test_requirements.txt b/test_requirements.txt
index adacdfcda..5db322b9b 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -1,7 +1,7 @@
 cython==0.29.17
-hypothesis==5.10.4
-pytest==5.4.1
-pytz==2019.3
+hypothesis==5.12.0
+pytest==5.4.2
+pytz==2020.1
 pytest-cov==2.8.1
 pickle5; python_version == '3.7'
 pickle5; python_version == '3.6' and platform_python_implementation != 'PyPy'
diff --git a/tools/openblas_support.py b/tools/openblas_support.py
index 105aae51f..984dea35a 100644
--- a/tools/openblas_support.py
+++ b/tools/openblas_support.py
@@ -1,22 +1,42 @@
+import glob
+import hashlib
 import os
+import platform
 import sys
-import glob
 import shutil
+import tarfile
 import textwrap
-import platform
+import zipfile
 
 from tempfile import mkstemp, gettempdir
-import zipfile
-import tarfile
+from urllib.request import urlopen, Request
 
 OPENBLAS_V = '0.3.9'
 # Temporary build of OpenBLAS to test a fix for dynamic detection of CPU
 OPENBLAS_LONG = 'v0.3.7-527-g79fd006c'  # the 0.3.7 is misleading
 BASE_LOC = 'https://anaconda.org/multibuild-wheels-staging/openblas-libs'
 BASEURL = f'{BASE_LOC}/{OPENBLAS_LONG}/download'
-ARCHITECTURES = ['', 'windows', 'darwin', 'aarch64', 'x86', 'ppc64le', 's390x']
+ARCHITECTURES = ['', 'windows', 'darwin', 'aarch64', 'x86_64', 'i686', 'ppc64le', 's390x']
+sha256_vals = {
+"openblas-v0.3.7-527-g79fd006c-win_amd64-gcc_7_1_0.zip": "7249d68c02e6b6339e06edfeab1fecddf29ee1e67a3afaa77917c320c43de840",
+"openblas64_-v0.3.7-527-g79fd006c-win_amd64-gcc_7_1_0.zip": "6488e0961a5926e47242f63b63b41cfdd661e6f1d267e8e313e397cde4775c17",
+"openblas-v0.3.7-527-g79fd006c-win32-gcc_7_1_0.zip": "5fb0867ca70b1d0fdbf68dd387c0211f26903d74631420e4aabb49e94aa3930d",
+"openblas-v0.3.7-527-g79fd006c-macosx_10_9_x86_64-gf_1becaaa.tar.gz": "69434bd626bbc495da9ce8c36b005d140c75e3c47f94e88c764a199e820f9259",
+"openblas64_-v0.3.7-527-g79fd006c-macosx_10_9_x86_64-gf_1becaaa.tar.gz": "093f6d953e3fa76a86809be67bd1f0b27656671b5a55b233169cfaa43fd63e22",
+"openblas-v0.3.7-527-g79fd006c-manylinux2014_aarch64.tar.gz": "42676c69dc48cd6e412251b39da6b955a5a0e00323ddd77f9137f7c259d35319",
+"openblas64_-v0.3.7-527-g79fd006c-manylinux2014_aarch64.tar.gz": "5aec167af4052cf5e9e3e416c522d9794efabf03a2aea78b9bb3adc94f0b73d8",
+"openblas-v0.3.7-527-g79fd006c-manylinux2010_x86_64.tar.gz": "fa67c6cc29d4cc5c70a147c80526243239a6f95fc3feadcf83a78176cd9c526b",
+"openblas64_-v0.3.7-527-g79fd006c-manylinux2010_x86_64.tar.gz": "9ad34e89a5307dcf5823bf5c020580d0559a0c155fe85b44fc219752e61852b0",
+"openblas-v0.3.7-527-g79fd006c-manylinux2010_i686.tar.gz": "0b8595d316c8b7be84ab1f1d5a0c89c1b35f7c987cdaf61d441bcba7ab4c7439",
+"openblas-v0.3.7-527-g79fd006c-manylinux2014_ppc64le.tar.gz": "3e1c7d6472c34e7210e3605be4bac9ddd32f613d44297dc50cf2d067e720c4a9",
+"openblas64_-v0.3.7-527-g79fd006c-manylinux2014_ppc64le.tar.gz": "a0885873298e21297a04be6cb7355a585df4fa4873e436b4c16c0a18fc9073ea",
+"openblas-v0.3.7-527-g79fd006c-manylinux2014_s390x.tar.gz": "79b454320817574e20499d58f05259ed35213bea0158953992b910607b17f240",
+"openblas64_-v0.3.7-527-g79fd006c-manylinux2014_s390x.tar.gz": "9fddbebf5301518fc4a5d2022a61886544a0566868c8c014359a1ee6b17f2814",
+}
+
 
 IS_32BIT = sys.maxsize < 2**32
+
 def get_arch():
     if platform.system() == 'Windows':
         ret = 'windows'
@@ -25,10 +45,10 @@ def get_arch():
     else:
         ret = platform.uname().machine
         # What do 32 bit machines report?
-        # If they are a docker, they report x86_64 or i686
-        if 'x86' in ret or ret == 'i686':
-            ret = 'x86'
-    assert ret in ARCHITECTURES
+        # If they are a docker, they can report x86_64; trust IS_32BIT instead
+        if 'x86' in ret and IS_32BIT:
+            ret = 'i686'
+    assert ret in ARCHITECTURES, f'invalid architecture {ret}'
     return ret
 
 def get_ilp64():
@@ -38,15 +58,28 @@ def get_ilp64():
         raise RuntimeError("NPY_USE_BLAS_ILP64 set on 32-bit arch")
     return "64_"
 
+def get_manylinux(arch):
+    if arch in ('x86_64', 'i686'):
+        default = '2010'
+    else:
+        default = '2014'
+    ret = os.environ.get("MB_ML_VER", default)
+    # XXX For PEP 600 this can be a glibc version
+    assert ret in ('1', '2010', '2014'), f'invalid MB_ML_VER {ret}'
+    return ret
+
+
 def download_openblas(target, arch, ilp64):
-    import urllib3
+    ml_ver = get_manylinux(arch)
     fnsuffix = {None: "", "64_": "64_"}[ilp64]
     filename = ''
-    if arch in ('aarch64', 'ppc64le', 's390x'):
-        suffix = f'manylinux2014_{arch}.tar.gz'
+    headers = {'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 ; '
+                              '(KHTML, like Gecko) Chrome/41.0.2228.0 '
+                              'Safari/537.3')}
+    if arch in ('aarch64', 'ppc64le', 's390x', 'x86_64', 'i686'):
+        suffix = f'manylinux{ml_ver}_{arch}.tar.gz'
         filename = f'{BASEURL}/openblas{fnsuffix}-{OPENBLAS_LONG}-{suffix}'
         typ = 'tar.gz'
-        typ = 'tar.gz'
     elif arch == 'darwin':
         suffix = 'macosx_10_9_x86_64-gf_1becaaa.tar.gz'
         filename = f'{BASEURL}/openblas{fnsuffix}-{OPENBLAS_LONG}-{suffix}'
@@ -58,24 +91,28 @@ def download_openblas(target, arch, ilp64):
             suffix = 'win_amd64-gcc_7_1_0.zip'
         filename = f'{BASEURL}/openblas{fnsuffix}-{OPENBLAS_LONG}-{suffix}'
         typ = 'zip'
-    elif 'x86' in arch:
-        if IS_32BIT:
-            suffix = 'manylinux2010_i686.tar.gz'
-        else:
-            suffix = 'manylinux2010_x86_64.tar.gz'
-        filename = f'{BASEURL}/openblas{fnsuffix}-{OPENBLAS_LONG}-{suffix}'
-        typ = 'tar.gz'
     if not filename:
         return None
-    print("Downloading:", filename, file=sys.stderr)
-    http = urllib3.PoolManager()
-    response = http.request('GET', filename)
+    req = Request(url=filename, headers=headers)
+    response = urlopen(req)
+    length = response.getheader('content-length')
     if response.status != 200:
         print(f'Could not download "{filename}"', file=sys.stderr)
         return None
+    print(f"Downloading {length} from {filename}", file=sys.stderr)
+    data = response.read()
+    # Verify hash
+    key = os.path.basename(filename)
+    sha256_returned = hashlib.sha256(data).hexdigest()
+    if key not in sha256_vals:
+        raise ValueError(
+            f'key "{key}" with hash "{sha256_returned}" not in sha256_vals')
+    sha256_expected = sha256_vals[key]
+    if sha256_returned != sha256_expected:
+        raise ValueError(f'sha256 hash mismatch for filename {filename}')
     print("Saving to file", file=sys.stderr)
     with open(target, 'wb') as fid:
-        fid.write(response.data)
+        fid.write(data)
     return typ
 
 def setup_openblas(arch=get_arch(), ilp64=get_ilp64()):
@@ -197,9 +234,10 @@ def test_setup(arches):
     def items():
         for arch in arches:
             yield arch, None
-            if arch in ('x86', 'darwin', 'windows'):
+            if arch not in ('i686',):
                 yield arch, '64_'
 
+    errs = []
     for arch, ilp64 in items():
         if arch == '':
             continue
@@ -208,9 +246,11 @@ def test_setup(arches):
         try:
             try:
                 target = setup_openblas(arch, ilp64)
-            except:
-                print(f'Could not setup {arch}')
-                raise
+            except Exception as e:
+                print(f'Could not setup {arch}:')
+                print(str(e))
+                errs.append(e)
+                continue
             if not target:
                 raise RuntimeError(f'Could not setup {arch}')
             print(target)
@@ -227,6 +267,9 @@ def test_setup(arches):
                     os.unlink(target)
                 else:
                     shutil.rmtree(target)
+    if errs:
+        raise errs[0]
+
 
 def test_version(expected_version, ilp64=get_ilp64()):
     """
diff --git a/tools/pypy-test.sh b/tools/pypy-test.sh
index e98c12587..e24d7a99d 100755
--- a/tools/pypy-test.sh
+++ b/tools/pypy-test.sh
@@ -7,8 +7,9 @@ set -o pipefail
 set -x
 
 sudo apt-get -yq update
-sudo apt-get -yq install libatlas-base-dev liblapack-dev gfortran-5 python3-urllib3
-F77=gfortran-5 F90=gfortran-5 \
+sudo apt-get -yq install gfortran-5
+export F77=gfortran-5
+export F90=gfortran-5
 
 # Download the proper OpenBLAS x64 precompiled library
 target=$(python3 tools/openblas_support.py)
@@ -27,8 +28,8 @@ include_dirs = $target/lib:$LIB
 runtime_library_dirs = $target/lib
 EOF
 
-echo getting PyPy 3.6 nightly
-wget -q http://buildbot.pypy.org/nightly/py3.6/pypy-c-jit-latest-linux64.tar.bz2 -O pypy.tar.bz2
+echo getting PyPy 3.6-v7.3.1
+wget -q https://downloads.python.org/pypy/pypy3.6-v7.3.1-linux64.tar.bz2 -O pypy.tar.bz2
 mkdir -p pypy3
 (cd pypy3; tar --strip-components=1 -xf ../pypy.tar.bz2)
 pypy3/bin/pypy3 -mensurepip
diff --git a/tools/refguide_check.py b/tools/refguide_check.py
index e6cfc8b77..31d2997d3 100644
--- a/tools/refguide_check.py
+++ b/tools/refguide_check.py
@@ -450,7 +450,7 @@ def validate_rst_syntax(text, name, dots=True):
         return False, "ERROR: %s: no documentation" % (name,)
 
     ok_unknown_items = set([
-        'mod', 'currentmodule', 'autosummary', 'data', 'attr',
+        'mod', 'doc', 'currentmodule', 'autosummary', 'data', 'attr',
         'obj', 'versionadded', 'versionchanged', 'module', 'class',
         'ref', 'func', 'toctree', 'moduleauthor', 'term', 'c:member',
         'sectionauthor', 'codeauthor', 'eq', 'doi', 'DOI', 'arXiv', 'arxiv'
diff --git a/tools/travis-before-install.sh b/tools/travis-before-install.sh
index dbe2f6ea2..e468dd932 100755
--- a/tools/travis-before-install.sh
+++ b/tools/travis-before-install.sh
@@ -41,8 +41,7 @@ pip install --upgrade pip
 # A specific version of cython is required, so we read the cython package
 # requirement using `grep cython test_requirements.txt` instead of simply
 # writing 'pip install setuptools wheel cython'.
-# urllib3 is needed for openblas_support
-pip install setuptools wheel urllib3 `grep cython test_requirements.txt`
+pip install setuptools wheel `grep cython test_requirements.txt`
 
 if [ -n "$DOWNLOAD_OPENBLAS" ]; then
   pwd