90 files changed, 3796 insertions, 1499 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index c8c1795c1..000000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,150 +0,0 @@
-# As config was originally based on an example by Olivier Grisel. Thanks!
-# https://github.com/ogrisel/python-appveyor-demo/blob/master/appveyor.yml
-clone_depth: 50
-
-# No reason for us to restrict the number concurrent jobs
-max_jobs: 100
-
-cache:
-  - '%LOCALAPPDATA%\pip\Cache'
-
-environment:
-  global:
-      MINGW_32: C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin
-      MINGW_64: C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin
-      OPENBLAS_32: https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com/openblas-5f998ef_gcc7_1_0_win32.zip
-      OPENBLAS_64: https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com/openblas-5f998ef_gcc7_1_0_win64.zip
-      APPVEYOR_SAVE_CACHE_ON_ERROR: true
-      APPVEYOR_SKIP_FINALIZE_ON_EXIT: true
-      TEST_TIMEOUT: 1000
-      NPY_NUM_BUILD_JOBS: 4
-
-  matrix:
-    - PYTHON: C:\Python36
-      PYTHON_VERSION: 3.6
-      PYTHON_ARCH: 32
-      TEST_MODE: fast
-
-    - PYTHON: C:\Python37
-      PYTHON_VERSION: 3.7
-      PYTHON_ARCH: 32
-      TEST_MODE: fast
-
-    - PYTHON: C:\Python36-x64
-      PYTHON_VERSION: 3.6
-      PYTHON_ARCH: 64
-      TEST_MODE: full
-
-    - PYTHON: C:\Python37-x64
-      PYTHON_VERSION: 3.7
-      PYTHON_ARCH: 64
-      TEST_MODE: full
-
-init:
-  - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%"
-  - "ECHO \"%APPVEYOR_SCHEDULED_BUILD%\""
-  # If there is a newer build queued for the same PR, cancel this one.
-  # The AppVeyor 'rollout builds' option is supposed to serve the same
-  # purpose but it is problematic because it tends to cancel builds pushed
-  # directly to master instead of just PR builds (or the converse).
-  # credits: JuliaLang developers.
-  - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
-        https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
-        Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
-          raise "There are newer queued builds for this pull request, skipping build."
-        }
-
-install:
-  # Prepend newly installed Python to the PATH of this build (this cannot be
-  # done from inside the powershell script as it would require to restart
-  # the parent CMD process).
-  - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
-  - if [%PYTHON_ARCH%]==[32] SET PATH=%MINGW_32%;%PATH% & SET OPENBLAS=%OPENBLAS_32%
-  - if [%PYTHON_ARCH%]==[64] SET PATH=%MINGW_64%;%PATH% & SET OPENBLAS=%OPENBLAS_64%
-
-  # Check that we have the expected version and architecture for Python
-  - python --version
-  - >-
-     %CMD_IN_ENV%
-     python -c "import sys,platform,struct;
-     print(sys.platform, platform.machine(), struct.calcsize('P') * 8, )"
-
-  # Install "openblas.a" to PYTHON\lib
-  # Library provided by Matthew Brett at https://github.com/matthew-brett/build-openblas
-  - ps: |
-      $clnt = new-object System.Net.WebClient
-      $file = "$(New-TemporaryFile).zip"
-      $tmpdir = New-TemporaryFile | %{ rm $_; mkdir $_ }
-      $destination = "$env:PYTHON\lib\openblas.a"
-
-      echo $file
-      echo $tmpdir
-      echo $env:OPENBLAS
-
-      $clnt.DownloadFile($env:OPENBLAS, $file)
-      Get-FileHash $file | Format-List
-
-      Expand-Archive $file $tmpdir
-
-      rm $tmpdir\$env:PYTHON_ARCH\lib\*.dll.a
-      $lib = ls $tmpdir\$env:PYTHON_ARCH\lib\*.a | ForEach { ls $_ } | Select-Object -first 1
-      echo $lib
-
-      cp $lib $destination
-      ls $destination
-
-  # Upgrade to the latest pip.
-  - 'python -m pip install -U pip setuptools wheel'
-
-  # Install the numpy test dependencies.
-  - 'pip install -U --timeout 5 --retries 2 -r test_requirements.txt'
-
-build_script:
-  # Here, we add MinGW to the path to be able to link an OpenBLAS.dll
-  # We then use the import library from the DLL to compile with MSVC
-  - ps: |
-      pip wheel -v -v -v --wheel-dir=dist .
-
-      # For each wheel that pip has placed in the "dist" directory
-      # First, upload the wheel to the "artifacts" tab and then
-      # install the wheel. If we have only built numpy (as is the case here),
-      # then there will be one wheel to install.
-
-      # This method is more representative of what will be distributed,
-      # because it actually tests what the built wheels will be rather than
-      # what 'setup.py install' will do and at it uploads the wheels so that
-      # they can be inspected.
-
-      ls dist -r | Foreach-Object {
-          Push-AppveyorArtifact $_.FullName
-          pip install $_.FullName
-      }
-
-test_script:
-  python runtests.py -v -n -m %TEST_MODE% -- --junitxml=%cd%\junit-results.xml
-
-after_build:
-  # Remove old or huge cache files to hopefully not exceed the 1GB cache limit.
-  #
-  # If the cache limit is reached, the cache will not be updated (of not even
-  # created in the first run). So this is a trade of between keeping the cache
-  # current and having a cache at all.
-  # NB: This is done only `on_success` since the cache in uploaded only on
-  # success anyway.
-  - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -mtime +360 -delete
-  - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -type f -size +10M -delete
-  - C:\cygwin\bin\find "%LOCALAPPDATA%\pip" -empty -delete
-  # Show size of cache
-  - C:\cygwin\bin\du -hs "%LOCALAPPDATA%\pip\Cache"
-
-on_finish:
-  # We can get a nice display of test results in the "test" tab with py.test
-  # For now, this does nothing.
-  - ps: |
-      If (Test-Path .\junit-results.xml) {
-        (new-object net.webclient).UploadFile(
-          "https://ci.appveyor.com/api/testresults/junit/$($env:APPVEYOR_JOB_ID)",
-          (Resolve-Path .\junit-results.xml)
-        )
-      }
-      $LastExitCode = 0
diff --git a/.codecov.yml b/.codecov.yml
index 35584a188..d92d54c9d 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -1,6 +1,4 @@
 codecov:
-  ci:
-    - !appveyor
   notify:
     require_ci_to_pass: no
     after_n_builds: 1
diff --git a/.travis.yml b/.travis.yml
index 714122957..68564d35b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,14 @@ cache:
   directories:
     - $HOME/.cache/pip
 
+stage: Comprehensive tests
+
+stages:
+    # Do the style check and a single test job, don't proceed if it fails
+    - name: Initial tests
+    # Do the rest of the tests
+    - name: Comprehensive tests
+
 env:
   global:
     - OpenBLAS_version=0.3.7
@@ -29,13 +37,14 @@ env:
                iFWt9Ka92CaqYdU7nqfWp9VImSndPmssjmCXJ1v1IjZPAM\
                ahp7Qnm0rWRmA0z9SomuRUQOJQ6s684vU="
 
-python:
-  - 3.5
-  - 3.6
-  - 3.7
-  - 3.8-dev
 matrix:
   include:
+    # Do all python versions without environment variables set
+    - python: 3.5
+    - stage: Initial tests
+      python: 3.6
+    - python: 3.7
+    - python: 3.8-dev
     - python: 3.7
       env: INSTALL_PICKLE5=1
     - python: 3.6
diff --git a/MANIFEST.in b/MANIFEST.in
index 7ab57eb8c..b58f85d4d 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -15,6 +15,7 @@ include tox.ini
 include .coveragerc
 include test_requirements.txt
 recursive-include numpy/random *.pyx *.pxd *.pyx.in *.pxd.in
+include numpy/random/include/*
 include numpy/__init__.pxd
 # Add build support that should go in sdist, but not go in bdist/be installed
 # Note that sub-directories that don't have __init__ are apparently not
diff --git a/README.md b/README.md
index 46fff43a0..0599c46f7 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,6 @@
 
 [![Travis](https://img.shields.io/travis/numpy/numpy/master.svg?label=Travis%20CI)](
     https://travis-ci.org/numpy/numpy)
-[![AppVeyor](https://img.shields.io/appveyor/ci/charris/numpy/master.svg?label=AppVeyor)](
-    https://ci.appveyor.com/project/charris/numpy)
 [![Azure](https://dev.azure.com/numpy/numpy/_apis/build/status/azure-pipeline%20numpy.numpy)](
     https://dev.azure.com/numpy/numpy/_build/latest?definitionId=5)
 [![codecov](https://codecov.io/gh/numpy/numpy/branch/master/graph/badge.svg)](
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ebc45ca96..633808c0b 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -10,212 +10,173 @@ variables:
   # to match numpy-wheels repo
   OpenBLAS_version: 0.3.7
 
-jobs:
-- job: Linux_Python_36_32bit_full_with_asserts
-  pool:
-    vmImage: 'ubuntu-16.04'
-  steps:
-  - script: |
-           docker pull i386/ubuntu:bionic
-           docker run -v $(pwd):/numpy i386/ubuntu:bionic /bin/bash -c "cd numpy && \
-           apt-get -y update && \
-           apt-get -y install python3.6-dev python3-pip locales python3-certifi && \
-           locale-gen fr_FR && update-locale && \
-           apt-get -y install gfortran-5 wget && \
-           target=\$(python3 tools/openblas_support.py) && \
-           cp -r \$target/usr/local/lib/* /usr/lib && \
-           cp \$target/usr/local/include/* /usr/include && \
-           python3 -m pip install --user --upgrade pip setuptools && \
-           python3 -m pip install --user -r test_requirements.txt && \
-           python3 -m pip install . && \
-           F77=gfortran-5 F90=gfortran-5 \
-           CFLAGS='-UNDEBUG -std=c99' python3 runtests.py -n --debug-info --mode=full -- -rsx --junitxml=junit/test-results.xml && \
-           python3 tools/openblas_support.py --check_version $(OpenBLAS_version)"
-    displayName: 'Run 32-bit Ubuntu Docker Build / Tests'
-  - task: PublishTestResults@2
-    condition: succeededOrFailed()
-    inputs:
-      testResultsFiles: '**/test-*.xml'
-      failTaskOnFailedTests: true
-      testRunTitle: 'Publish test results for Python 3.6-32 bit full Linux'
-- job: macOS
-  pool:
-    # NOTE: at time of writing, there is a danger
-    # that using an invalid vmIMage string for macOS
-    # image silently redirects to a Windows build on Azure;
-    # for now, use the only image name officially present in
-    # the docs even though i.e., numba uses another in their
-    # azure config for mac os -- Microsoft has indicated
-    # they will patch this issue
-    vmImage: macOS-10.13
-  steps:
-  # the @0 refers to the (major) version of the *task* on Microsoft's
-  # end, not the order in the build matrix nor anything to do
-  # with version of Python selected
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '3.6'
-      addToPath: true
-      architecture: 'x64'
-  # NOTE: do we have a compelling reason to use older / newer
-  # versions of Xcode toolchain for testing?
-  - script: /bin/bash -c "sudo xcode-select -s /Applications/Xcode_10.app/Contents/Developer"
-    displayName: 'select Xcode version'
-  # NOTE: might be better if we could avoid installing
-  # two C compilers, but with homebrew looks like we're
-  # now stuck getting the full gcc toolchain instead of
-  # just pulling in gfortran
-  - script: |
-      # same version of gfortran as the wheel builds
-      brew install gcc49
-      # manually link critical gfortran libraries
-      ln -s /usr/local/Cellar/gcc@4.9/4.9.4_1/lib/gcc/4.9/libgfortran.3.dylib /usr/local/lib/libgfortran.3.dylib
-      ln -s /usr/local/Cellar/gcc@4.9/4.9.4_1/lib/gcc/4.9/libquadmath.0.dylib /usr/local/lib/libquadmath.0.dylib
-      # manually symlink gfortran-4.9 to plain gfortran
-      # for f2py
-      ln -s /usr/local/bin/gfortran-4.9 /usr/local/bin/gfortran
-    displayName: 'make gfortran available on mac os vm'
-  # use the pre-built openblas binary that most closely
-  # matches our MacOS wheel builds -- currently based
-  # primarily on file size / name details
-  - script: |
-      target=$(python tools/openblas_support.py)
-      # manually link to appropriate system paths
-      cp $target/usr/local/lib/* /usr/local/lib/
-      cp $target/usr/local/include/* /usr/local/include/
-    displayName: 'install pre-built openblas'
-  - script: python -m pip install --upgrade pip setuptools wheel
-    displayName: 'Install tools'
-  - script: |
-      python -m pip install -r test_requirements.txt
-      python -m pip install vulture docutils sphinx==2.2.0 numpydoc
-    displayName: 'Install dependencies; some are optional to avoid test skips'
-  - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
-    displayName: 'Check for unreachable code paths in Python modules'
-  # prefer usage of clang over gcc proper
-  # to match likely scenario on many user mac machines
-  - script: python setup.py build -j 4 build_src --verbose-cfg install
-    displayName: 'Build NumPy'
-    env:
-      BLAS: None
-      LAPACK: None
-      ATLAS: None
-      ACCELERATE: None
-      CC: /usr/bin/clang
-  # wait until after dev build of NumPy to pip
-  # install matplotlib to avoid pip install of older numpy
-  - script: python -m pip install matplotlib
-    displayName: 'Install matplotlib before refguide run'
-  - script: python runtests.py -g --refguide-check
-    displayName: 'Run Refuide Check'
-  - script: python runtests.py -n --mode=full -- -rsx --junitxml=junit/test-results.xml
-    displayName: 'Run Full NumPy Test Suite'
-  - bash: python tools/openblas_support.py --check_version $(OpenBLAS_version)
-    displayName: 'Verify OpenBLAS version'
-  - task: PublishTestResults@2
-    condition: succeededOrFailed()
-    inputs:
-      testResultsFiles: '**/test-*.xml'
-      failTaskOnFailedTests: true
-      testRunTitle: 'Publish test results for Python 3.6 64-bit full Mac OS'
-- job: Windows
-  pool:
-    vmImage: 'VS2017-Win2016'
-  strategy:
-    maxParallel: 6
-    matrix:
-        Python36-32bit-fast:
-          PYTHON_VERSION: '3.6'
-          PYTHON_ARCH: 'x86'
-          TEST_MODE: fast
-          BITS: 32
-        Python37-32bit-fast:
-          PYTHON_VERSION: '3.7'
-          PYTHON_ARCH: 'x86'
-          TEST_MODE: fast
-          BITS: 32
-        Python35-64bit-full:
-          PYTHON_VERSION: '3.5'
-          PYTHON_ARCH: 'x64'
-          TEST_MODE: full
-          BITS: 64
-        Python36-64bit-full:
-          PYTHON_VERSION: '3.6'
-          PYTHON_ARCH: 'x64'
-          TEST_MODE: full
-          BITS: 64
-        Python37-64bit-full:
-          PYTHON_VERSION: '3.7'
-          PYTHON_ARCH: 'x64'
-          TEST_MODE: full
-          BITS: 64
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: $(PYTHON_VERSION)
-      addToPath: true
-      architecture: $(PYTHON_ARCH)
-  - script: python -m pip install --upgrade pip setuptools wheel
-    displayName: 'Install tools'
-  - script: python -m pip install -r test_requirements.txt
-    displayName: 'Install dependencies; some are optional to avoid test skips'
-  - powershell: |
-      $pyversion = python -c "from __future__ import print_function; import sys; print(sys.version.split()[0])"
-      Write-Host "Python Version: $pyversion"
-      $target = "C:\\hostedtoolcache\\windows\\Python\\$pyversion\\$(PYTHON_ARCH)\\lib\\openblas.a"
-      Write-Host "target path: $target"
-      $openblas = python tools/openblas_support.py
-      cp $openblas $target
-    displayName: 'Download / Install OpenBLAS'
-
-  - powershell: |
-      choco install -y mingw --forcex86 --force --version=5.3.0
-    displayName: 'Install 32-bit mingw for 32-bit builds'
-    condition: eq(variables['BITS'], 32)
-  # NOTE: for Windows builds it seems much more tractable to use runtests.py
-  # vs. manual setup.py and then runtests.py for testing only
-  - powershell: |
-      If ($(BITS) -eq 32) {
-         $env:CFLAGS = "-m32"
-         $env:LDFLAGS = "-m32"
-         $env:PATH = "C:\\tools\\mingw32\\bin;" + $env:PATH
-         refreshenv
-      }
-      python -c "from tools import openblas_support; openblas_support.make_init('numpy')"
-      pip wheel -v -v -v --wheel-dir=dist .
-
-      ls dist -r | Foreach-Object {
-          pip install $_.FullName
-      }
-    displayName: 'Build NumPy'
-  - bash: |
-      pushd . && cd .. && target=$(python -c "import numpy, os; print(os.path.abspath(os.path.join(os.path.dirname(numpy.__file__), '.libs')))") && popd
-      pip download -d destination --only-binary --no-deps numpy==1.14
-      cd destination && unzip numpy*.whl && cp numpy/.libs/*.dll $target
-      ls $target
-    displayName: 'Add extraneous & older DLL to numpy/.libs to probe DLL handling robustness'
-    condition: eq(variables['PYTHON_VERSION'], '3.6')
-  - script: pushd . && cd .. && python -c "from ctypes import windll; windll.kernel32.SetDefaultDllDirectories(0x00000800); import numpy" && popd
-    displayName: 'For gh-12667; Windows DLL resolution'
-  - script: python runtests.py -n --show-build-log --mode=$(TEST_MODE) -- -rsx --junitxml=junit/test-results.xml
-    displayName: 'Run NumPy Test Suite'
-  - task: PublishTestResults@2
-    condition: succeededOrFailed()
-    inputs:
-      testResultsFiles: '**/test-*.xml'
-      failTaskOnFailedTests: true
-      testRunTitle: 'Publish test results for Python $(PYTHON_VERSION) $(BITS)-bit $(TEST_MODE) Windows'
-
-- job: Linux_PyPy3
-  pool:
-    vmIMage: 'ubuntu-16.04'
-  steps:
-  - script: source tools/pypy-test.sh
-    displayName: 'Run PyPy3 Build / Tests'
-  - task: PublishTestResults@2
-    condition: succeededOrFailed()
-    inputs:
-      testResultsFiles: '**/test-*.xml'
-      testRunTitle: 'Publish test results for PyPy3'
-      failTaskOnFailedTests: true
+stages:
+- stage: InitialTests
+  jobs:
+  - job: WindowsFast
+    pool:
+      vmImage: 'VS2017-Win2016'
+    strategy:
+      matrix:
+          Python36-64bit-fast:
+            PYTHON_VERSION: '3.6'
+            PYTHON_ARCH: 'x64'
+            TEST_MODE: fast
+            BITS: 64
+    steps:
+    - template: azure-steps-windows.yml
 
+- stage: ComprehensiveTests
+  jobs:
+  - job: Linux_Python_36_32bit_full_with_asserts
+    pool:
+      vmImage: 'ubuntu-16.04'
+    steps:
+    - script: |
+            docker pull i386/ubuntu:bionic
+            docker run -v $(pwd):/numpy i386/ubuntu:bionic /bin/bash -c "cd numpy && \
+            apt-get -y update && \
+            apt-get -y install python3.6-dev python3-pip locales python3-certifi && \
+            locale-gen fr_FR && update-locale && \
+            apt-get -y install gfortran-5 wget && \
+            target=\$(python3 tools/openblas_support.py) && \
+            cp -r \$target/usr/local/lib/* /usr/lib && \
+            cp \$target/usr/local/include/* /usr/include && \
+            python3 -m pip install --user --upgrade pip setuptools && \
+            python3 -m pip install --user -r test_requirements.txt && \
+            python3 -m pip install . && \
+            F77=gfortran-5 F90=gfortran-5 \
+            CFLAGS='-UNDEBUG -std=c99' python3 runtests.py -n --debug-info --mode=full -- -rsx --junitxml=junit/test-results.xml && \
+            python3 tools/openblas_support.py --check_version $(OpenBLAS_version)"
+      displayName: 'Run 32-bit Ubuntu Docker Build / Tests'
+    - task: PublishTestResults@2
+      condition: succeededOrFailed()
+      inputs:
+        testResultsFiles: '**/test-*.xml'
+        failTaskOnFailedTests: true
+        testRunTitle: 'Publish test results for Python 3.6-32 bit full Linux'
+  - job: macOS
+    pool:
+      # NOTE: at time of writing, there is a danger
+      # that using an invalid vmImage string for macOS
+      # image silently redirects to a Windows build on Azure;
+      # for now, use the only image name officially present in
+      # the docs even though, e.g., numba uses another in their
+      # azure config for mac os -- Microsoft has indicated
+      # they will patch this issue
+      vmImage: macOS-10.13
+    steps:
+    # the @0 refers to the (major) version of the *task* on Microsoft's
+    # end, not the order in the build matrix nor anything to do
+    # with version of Python selected
+    - task: UsePythonVersion@0
+      inputs:
+        versionSpec: '3.6'
+        addToPath: true
+        architecture: 'x64'
+    # NOTE: do we have a compelling reason to use older / newer
+    # versions of Xcode toolchain for testing?
+    - script: /bin/bash -c "sudo xcode-select -s /Applications/Xcode_10.app/Contents/Developer"
+      displayName: 'select Xcode version'
+    # NOTE: might be better if we could avoid installing
+    # two C compilers, but with homebrew looks like we're
+    # now stuck getting the full gcc toolchain instead of
+    # just pulling in gfortran
+    - script: |
+        # same version of gfortran as the wheel builds
+        brew install gcc49
+        # manually link critical gfortran libraries
+        ln -s /usr/local/Cellar/gcc@4.9/4.9.4_1/lib/gcc/4.9/libgfortran.3.dylib /usr/local/lib/libgfortran.3.dylib
+        ln -s /usr/local/Cellar/gcc@4.9/4.9.4_1/lib/gcc/4.9/libquadmath.0.dylib /usr/local/lib/libquadmath.0.dylib
+        # manually symlink gfortran-4.9 to plain gfortran
+        # for f2py
+        ln -s /usr/local/bin/gfortran-4.9 /usr/local/bin/gfortran
+      displayName: 'make gfortran available on mac os vm'
+    # use the pre-built openblas binary that most closely
+    # matches our MacOS wheel builds -- currently based
+    # primarily on file size / name details
+    - script: |
+        target=$(python tools/openblas_support.py)
+        # manually link to appropriate system paths
+        cp $target/usr/local/lib/* /usr/local/lib/
+        cp $target/usr/local/include/* /usr/local/include/
+      displayName: 'install pre-built openblas'
+    - script: python -m pip install --upgrade pip setuptools wheel
+      displayName: 'Install tools'
+    - script: |
+        python -m pip install -r test_requirements.txt
+        python -m pip install vulture docutils sphinx==2.2.0 numpydoc
+      displayName: 'Install dependencies; some are optional to avoid test skips'
+    - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
+      displayName: 'Check for unreachable code paths in Python modules'
+    # prefer usage of clang over gcc proper
+    # to match likely scenario on many user mac machines
+    - script: python setup.py build -j 4 build_src --verbose-cfg install
+      displayName: 'Build NumPy'
+      env:
+        BLAS: None
+        LAPACK: None
+        ATLAS: None
+        ACCELERATE: None
+        CC: /usr/bin/clang
+    # wait until after dev build of NumPy to pip
+    # install matplotlib to avoid pip install of older numpy
+    - script: python -m pip install matplotlib
+      displayName: 'Install matplotlib before refguide run'
+    - script: python runtests.py -g --refguide-check
+      displayName: 'Run Refguide Check'
+    - script: python runtests.py -n --mode=full -- -rsx --junitxml=junit/test-results.xml
+      displayName: 'Run Full NumPy Test Suite'
+    - bash: python tools/openblas_support.py --check_version $(OpenBLAS_version)
+      displayName: 'Verify OpenBLAS version'
+    - task: PublishTestResults@2
+      condition: succeededOrFailed()
+      inputs:
+        testResultsFiles: '**/test-*.xml'
+        failTaskOnFailedTests: true
+        testRunTitle: 'Publish test results for Python 3.6 64-bit full Mac OS'
+  - job: Windows
+    pool:
+      vmImage: 'VS2017-Win2016'
+    strategy:
+      maxParallel: 6
+      matrix:
+          Python36-32bit-fast:
+            PYTHON_VERSION: '3.6'
+            PYTHON_ARCH: 'x86'
+            TEST_MODE: fast
+            BITS: 32
+          Python37-32bit-fast:
+            PYTHON_VERSION: '3.7'
+            PYTHON_ARCH: 'x86'
+            TEST_MODE: fast
+            BITS: 32
+          Python35-64bit-full:
+            PYTHON_VERSION: '3.5'
+            PYTHON_ARCH: 'x64'
+            TEST_MODE: full
+            BITS: 64
+          Python36-64bit-full:
+            PYTHON_VERSION: '3.6'
+            PYTHON_ARCH: 'x64'
+            TEST_MODE: full
+            BITS: 64
+          Python37-64bit-full:
+            PYTHON_VERSION: '3.7'
+            PYTHON_ARCH: 'x64'
+            TEST_MODE: full
+            BITS: 64
+    steps:
+    - template: azure-steps-windows.yml
+  - job: Linux_PyPy3
+    pool:
+      vmImage: 'ubuntu-16.04'  # key spelled like every other job's pool.vmImage
+    steps:
+    - script: source tools/pypy-test.sh
+      displayName: 'Run PyPy3 Build / Tests'
+    - task: PublishTestResults@2
+      condition: succeededOrFailed()
+      inputs:
+        testResultsFiles: '**/test-*.xml'
+        testRunTitle: 'Publish test results for PyPy3'
+        failTaskOnFailedTests: true
diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml
new file mode 100644
index 000000000..26d7a667d
--- /dev/null
+++ b/azure-steps-windows.yml
@@ -0,0 +1,56 @@
+steps:
+- task: UsePythonVersion@0
+  inputs:
+    versionSpec: $(PYTHON_VERSION)
+    addToPath: true
+    architecture: $(PYTHON_ARCH)
+- script: python -m pip install --upgrade pip setuptools wheel
+  displayName: 'Install tools'
+- script: python -m pip install -r test_requirements.txt
+  displayName: 'Install dependencies; some are optional to avoid test skips'
+- powershell: |
+    $pyversion = python -c "from __future__ import print_function; import sys; print(sys.version.split()[0])"
+    Write-Host "Python Version: $pyversion"
+    $target = "C:\\hostedtoolcache\\windows\\Python\\$pyversion\\$(PYTHON_ARCH)\\lib\\openblas.a"
+    Write-Host "target path: $target"
+    $openblas = python tools/openblas_support.py
+    cp $openblas $target
+  displayName: 'Download / Install OpenBLAS'
+
+- powershell: |
+    choco install -y mingw --forcex86 --force --version=5.3.0
+  displayName: 'Install 32-bit mingw for 32-bit builds'
+  condition: eq(variables['BITS'], 32)
+# NOTE: for Windows builds it seems much more tractable to use runtests.py
+# vs. manual setup.py and then runtests.py for testing only
+- powershell: |
+    If ($(BITS) -eq 32) {
+        $env:CFLAGS = "-m32"
+        $env:LDFLAGS = "-m32"
+        $env:PATH = "C:\\tools\\mingw32\\bin;" + $env:PATH
+        refreshenv
+    }
+    python -c "from tools import openblas_support; openblas_support.make_init('numpy')"
+    pip wheel -v -v -v --wheel-dir=dist .
+
+    ls dist -r | Foreach-Object {
+        pip install $_.FullName
+    }
+  displayName: 'Build NumPy'
+- bash: |
+    pushd . && cd .. && target=$(python -c "import numpy, os; print(os.path.abspath(os.path.join(os.path.dirname(numpy.__file__), '.libs')))") && popd
+    pip download -d destination --only-binary :all: --no-deps numpy==1.14  # --only-binary takes a value; without ':all:' it consumed --no-deps
+    cd destination && unzip numpy*.whl && cp numpy/.libs/*.dll $target
+    ls $target
+  displayName: 'Add extraneous & older DLL to numpy/.libs to probe DLL handling robustness'
+  condition: eq(variables['PYTHON_VERSION'], '3.6')
+- script: pushd . && cd .. && python -c "from ctypes import windll; windll.kernel32.SetDefaultDllDirectories(0x00000800); import numpy" && popd
+  displayName: 'For gh-12667; Windows DLL resolution'
+- script: python runtests.py -n --show-build-log --mode=$(TEST_MODE) -- -rsx --junitxml=junit/test-results.xml
+  displayName: 'Run NumPy Test Suite'
+- task: PublishTestResults@2
+  condition: succeededOrFailed()
+  inputs:
+    testResultsFiles: '**/test-*.xml'
+    failTaskOnFailedTests: true
+    testRunTitle: 'Publish test results for Python $(PYTHON_VERSION) $(BITS)-bit $(TEST_MODE) Windows'
+\ No newline at end of file
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
new file mode 100644
index 000000000..f7b524e43
--- /dev/null
+++ b/benchmarks/benchmarks/bench_avx.py
@@ -0,0 +1,34 @@
+from __future__ import absolute_import, division, print_function
+
+from .common import Benchmark
+
+import numpy as np
+
+avx_ufuncs = ['sqrt',
+              'absolute',
+              'reciprocal',
+              'square',
+              'rint',
+              'floor',
+              'ceil' ,
+              'trunc']
+stride = [1, 2, 4]
+dtype  = ['f', 'd']
+
+class AVX_UFunc(Benchmark):  # times each ufunc in avx_ufuncs per stride/dtype
+    params = [avx_ufuncs, stride, dtype]
+    param_names = ['avx_based_ufunc', 'stride', 'dtype']
+    timeout = 10
+
+    def setup(self, ufuncname, stride, dtype):
+        np.seterr(all='ignore')  # set numpy floating-point error handling to 'ignore'
+        try:
+            self.f = getattr(np, ufuncname)
+        except AttributeError:
+            raise NotImplementedError()  # np has no attribute with this name
+        N = 10000
+        self.arr = np.ones(stride*N, dtype)  # sized so arr[::stride] has N items
+
+    def time_ufunc(self, ufuncname, stride, dtype):
+        self.f(self.arr[::stride])  # call the ufunc on every stride-th element
+
diff --git a/doc/changelog/1.17.3-changelog.rst b/doc/changelog/1.17.3-changelog.rst
new file mode 100644
index 000000000..f911c8465
--- /dev/null
+++ b/doc/changelog/1.17.3-changelog.rst
@@ -0,0 +1,32 @@
+
+Contributors
+============
+
+A total of 7 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Allan Haldane
+* Charles Harris
+* Kevin Sheppard
+* Matti Picus
+* Ralf Gommers
+* Sebastian Berg
+* Warren Weckesser
+
+Pull requests merged
+====================
+
+A total of 12 pull requests were merged for this release.
+
+* `#14456 <https://github.com/numpy/numpy/pull/14456>`__: MAINT: clean up pocketfft modules inside numpy.fft namespace.
+* `#14463 <https://github.com/numpy/numpy/pull/14463>`__: BUG: random.hypergeometic assumes npy_long is npy_int64, hung...
+* `#14502 <https://github.com/numpy/numpy/pull/14502>`__: BUG: random: Revert gh-14458 and refix gh-14557.
+* `#14504 <https://github.com/numpy/numpy/pull/14504>`__: BUG: add a specialized loop for boolean matmul.
+* `#14506 <https://github.com/numpy/numpy/pull/14506>`__: MAINT: Update pytest version for Python 3.8
+* `#14512 <https://github.com/numpy/numpy/pull/14512>`__: DOC: random: fix doc linking, was referencing private submodules.
+* `#14513 <https://github.com/numpy/numpy/pull/14513>`__: BUG,MAINT: Some fixes and minor cleanup based on clang analysis
+* `#14515 <https://github.com/numpy/numpy/pull/14515>`__: BUG: Fix randint when range is 2**32
+* `#14519 <https://github.com/numpy/numpy/pull/14519>`__: MAINT: remove the entropy c-extension module
+* `#14563 <https://github.com/numpy/numpy/pull/14563>`__: DOC: remove note about Pocketfft license file (non-existing here).
+* `#14578 <https://github.com/numpy/numpy/pull/14578>`__: BUG: random: Create a legacy implementation of random.binomial.
+* `#14687 <https://github.com/numpy/numpy/pull/14687>`__: BUG: properly define PyArray_DescrCheck
diff --git a/doc/neps/nep-0029-deprecation_policy.rst b/doc/neps/nep-0029-deprecation_policy.rst
index 0dea0a96f..2f5c8ecb5 100644
--- a/doc/neps/nep-0029-deprecation_policy.rst
+++ b/doc/neps/nep-0029-deprecation_policy.rst
@@ -4,9 +4,10 @@ NEP 29 — Recommend Python and Numpy version support as a community policy stan
 
 
 :Author: Thomas A Caswell <tcaswell@gmail.com>, Andreas Mueller, Brian Granger, Madicken Munk, Ralf Gommers, Matt Haberland <mhaberla@calpoly.edu>, Matthias Bussonnier <bussonniermatthias@gmail.com>, Stefan van der Walt <stefanv@berkeley.edu>
-:Status: Draft
-:Type: Informational Track
+:Status: Final
+:Type: Informational
 :Created: 2019-07-13
+:Resolution: https://mail.python.org/pipermail/numpy-discussion/2019-October/080128.html
 
 
 Abstract
@@ -164,7 +165,7 @@ the minimum version of Python supported.
 As a major downside, an ad-hoc approach makes it hard for downstream users to predict what
 the future minimum versions will be.  As there is no objective threshold
 to when the minimum version should be dropped, it is easy for these
-version support discussions to devolve into [bike shedding](https://en.wikipedia.org/wiki/Wikipedia:Avoid_Parkinson%27s_bicycle-shed_effect) and acrimony.
+version support discussions to devolve into `bike shedding <https://en.wikipedia.org/wiki/Wikipedia:Avoid_Parkinson%27s_bicycle-shed_effect>`_ and acrimony.
 
 
 All CPython supported versions
diff --git a/changelog/13829.enhancement.rst b/doc/release/upcoming_changes/13829.improvement.rst
index ede1b2a53..ede1b2a53 100644
--- a/changelog/13829.enhancement.rst
+++ b/doc/release/upcoming_changes/13829.improvement.rst
diff --git a/doc/release/upcoming_changes/14717.compatibility.rst b/doc/release/upcoming_changes/14717.compatibility.rst
new file mode 100644
index 000000000..f6f0ec8e5
--- /dev/null
+++ b/doc/release/upcoming_changes/14717.compatibility.rst
@@ -0,0 +1,4 @@
+``numpy.argmin/argmax/min/max`` returns ``NaT`` if it exists in array
+---------------------------------------------------------------------
+``numpy.argmin``, ``numpy.argmax``, ``numpy.min``, and ``numpy.max`` will return
+``NaT`` if it exists in the array.
diff --git a/doc/release/upcoming_changes/14720.deprecation.rst b/doc/release/upcoming_changes/14720.deprecation.rst
new file mode 100644
index 000000000..46ad6d8f7
--- /dev/null
+++ b/doc/release/upcoming_changes/14720.deprecation.rst
@@ -0,0 +1,8 @@
+Deprecate the financial functions
+---------------------------------
+In accordance with
+`NEP-32 <https://numpy.org/neps/nep-0032-remove-financial-functions.html>`_,
+the functions `fv`, `ipmt`, `irr`, `mirr`, `nper`, `npv`, `pmt`, `ppmt`,
+`pv` and `rate` are deprecated, and will be removed from NumPy 1.20.
+The replacement for these functions is the Python package
+`numpy-financial <https://pypi.org/project/numpy-financial>`_.
diff --git a/doc/source/reference/maskedarray.generic.rst b/doc/source/reference/maskedarray.generic.rst
index 7375d60fb..41c3ee564 100644
--- a/doc/source/reference/maskedarray.generic.rst
+++ b/doc/source/reference/maskedarray.generic.rst
@@ -74,7 +74,7 @@ To create an array with the second element invalid, we would do::
 To create a masked array where all values close to 1.e20 are invalid, we would
 do::
 
-   >>> z = masked_values([1.0, 1.e20, 3.0, 4.0], 1.e20)
+   >>> z = ma.masked_values([1.0, 1.e20, 3.0, 4.0], 1.e20)
 
 For a complete discussion of creation methods for masked arrays please see
 section :ref:`Constructing masked arrays <maskedarray.generic.constructing>`.
@@ -110,15 +110,15 @@ There are several ways to construct a masked array.
 
      >>> x = np.array([1, 2, 3])
      >>> x.view(ma.MaskedArray)
-     masked_array(data = [1 2 3],
-                  mask = False,
-            fill_value = 999999)
+     masked_array(data=[1, 2, 3],
+                  mask=False,
+            fill_value=999999)
      >>> x = np.array([(1, 1.), (2, 2.)], dtype=[('a',int), ('b', float)])
      >>> x.view(ma.MaskedArray)
-     masked_array(data = [(1, 1.0) (2, 2.0)],
-                  mask = [(False, False) (False, False)],
-            fill_value = (999999, 1e+20),
-                 dtype = [('a', '<i4'), ('b', '<f8')])
+     masked_array(data=[(1, 1.0), (2, 2.0)],
+                  mask=[(False, False), (False, False)],
+            fill_value=(999999, 1.e+20),
+                 dtype=[('a', '<i8'), ('b', '<f8')])
 
 * Yet another possibility is to use any of the following functions:
 
@@ -195,9 +195,9 @@ index. The inverse of the mask can be calculated with the
 
    >>> x = ma.array([[1, 2], [3, 4]], mask=[[0, 1], [1, 0]])
    >>> x[~x.mask]
-   masked_array(data = [1 4],
-                mask = [False False],
-          fill_value = 999999)
+   masked_array(data=[1, 4],
+                mask=[False, False],
+          fill_value=999999)
 
 Another way to retrieve the valid data is to use the :meth:`compressed`
 method, which returns a one-dimensional :class:`~numpy.ndarray` (or one of its
@@ -223,27 +223,26 @@ as invalid is to assign the special value :attr:`masked` to them::
    >>> x = ma.array([1, 2, 3])
    >>> x[0] = ma.masked
    >>> x
-   masked_array(data = [-- 2 3],
-                mask = [ True False False],
-          fill_value = 999999)
+   masked_array(data=[--, 2, 3],
+                mask=[ True, False, False],
+          fill_value=999999)
    >>> y = ma.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> y[(0, 1, 2), (1, 2, 0)] = ma.masked
    >>> y
-   masked_array(data =
-    [[1 -- 3]
-     [4 5 --]
-     [-- 8 9]],
-                mask =
-    [[False  True False]
-     [False False  True]
-     [ True False False]],
-          fill_value = 999999)
+   masked_array(
+     data=[[1, --, 3],
+           [4, 5, --],
+           [--, 8, 9]],
+     mask=[[False,  True, False],
+           [False, False,  True],
+           [ True, False, False]],
+     fill_value=999999)
    >>> z = ma.array([1, 2, 3, 4])
    >>> z[:-2] = ma.masked
    >>> z
-   masked_array(data = [-- -- 3 4],
-                mask = [ True  True False False],
-          fill_value = 999999)
+   masked_array(data=[--, --, 3, 4],
+                mask=[ True,  True, False, False],
+          fill_value=999999)
 
 
 A second possibility is to modify the :attr:`~MaskedArray.mask` directly,
@@ -263,9 +262,10 @@ mask::
    >>> x = ma.array([1, 2, 3], mask=[0, 0, 1])
    >>> x.mask = True
    >>> x
-   masked_array(data = [-- -- --],
-                mask = [ True  True  True],
-          fill_value = 999999)
+   masked_array(data=[--, --, --],
+                mask=[ True,  True,  True],
+          fill_value=999999,
+               dtype=int64)
 
 Finally, specific entries can be masked and/or unmasked by assigning to the
 mask a sequence of booleans::
@@ -273,9 +273,9 @@ mask a sequence of booleans::
    >>> x = ma.array([1, 2, 3])
    >>> x.mask = [0, 1, 0]
    >>> x
-   masked_array(data = [1 -- 3],
-                mask = [False  True False],
-          fill_value = 999999)
+   masked_array(data=[1, --, 3],
+                mask=[False,  True, False],
+          fill_value=999999)
 
 Unmasking an entry
 ~~~~~~~~~~~~~~~~~~
@@ -285,14 +285,14 @@ new valid values to them::
 
    >>> x = ma.array([1, 2, 3], mask=[0, 0, 1])
    >>> x
-   masked_array(data = [1 2 --],
-                mask = [False False  True],
-          fill_value = 999999)
+   masked_array(data=[1, 2, --],
+                mask=[False, False,  True],
+          fill_value=999999)
    >>> x[-1] = 5
    >>> x
-   masked_array(data = [1 2 5],
-                mask = [False False False],
-          fill_value = 999999)
+   masked_array(data=[1, 2, 5],
+                mask=[False, False, False],
+          fill_value=999999)
 
 .. note::
    Unmasking an entry by direct assignment will silently fail if the masked
@@ -304,21 +304,27 @@ new valid values to them::
 
       >>> x = ma.array([1, 2, 3], mask=[0, 0, 1], hard_mask=True)
       >>> x
-      masked_array(data = [1 2 --],
-                   mask = [False False  True],
-             fill_value = 999999)
+      masked_array(data=[1, 2, --],
+                   mask=[False, False,  True],
+             fill_value=999999)
       >>> x[-1] = 5
       >>> x
-      masked_array(data = [1 2 --],
-                   mask = [False False  True],
-             fill_value = 999999)
+      masked_array(data=[1, 2, --],
+                   mask=[False, False,  True],
+             fill_value=999999)
       >>> x.soften_mask()
+      masked_array(data=[1, 2, --],
+                   mask=[False, False,  True],
+             fill_value=999999)
       >>> x[-1] = 5
       >>> x
-      masked_array(data = [1 2 5],
-                   mask = [False False  False],
-             fill_value = 999999)
+      masked_array(data=[1, 2, 5],
+                   mask=[False, False, False],
+             fill_value=999999)
       >>> x.harden_mask()
+      masked_array(data=[1, 2, 5],
+                   mask=[False, False, False],
+             fill_value=999999)
 
 
 To unmask all masked entries of a masked array (provided the mask isn't a hard
@@ -327,15 +333,14 @@ mask::
 
    >>> x = ma.array([1, 2, 3], mask=[0, 0, 1])
    >>> x
-   masked_array(data = [1 2 --],
-                mask = [False False  True],
-          fill_value = 999999)
+   masked_array(data=[1, 2, --],
+                mask=[False, False,  True],
+          fill_value=999999)
    >>> x.mask = ma.nomask
    >>> x
-   masked_array(data = [1 2 3],
-                mask = [False False False],
-          fill_value = 999999)
-
+   masked_array(data=[1, 2, 3],
+                mask=[False, False, False],
+          fill_value=999999)
 
 
 Indexing and slicing
@@ -353,9 +358,7 @@ the mask is ``True``)::
    >>> x[0]
    1
    >>> x[-1]
-   masked_array(data = --,
-                mask = True,
-          fill_value = 1e+20)
+   masked
    >>> x[-1] is ma.masked
    True
 
@@ -370,10 +373,7 @@ is masked.
    >>> y[0]
    (1, 2)
    >>> y[-1]
-   masked_array(data = (3, --),
-                mask = (False, True),
-          fill_value = (999999, 999999),
-               dtype = [('a', '<i4'), ('b', '<i4')])
+   (3, --)
 
 
 When accessing a slice, the output is a masked array whose
@@ -385,20 +385,19 @@ required to ensure propagation of any modification of the mask to the original.
    >>> x = ma.array([1, 2, 3, 4, 5], mask=[0, 1, 0, 0, 1])
    >>> mx = x[:3]
    >>> mx
-   masked_array(data = [1 -- 3],
-                mask = [False  True False],
-          fill_value = 999999)
+   masked_array(data=[1, --, 3],
+                mask=[False,  True, False],
+          fill_value=999999)
    >>> mx[1] = -1
    >>> mx
-   masked_array(data = [1 -1 3],
-                mask = [False False False],
-          fill_value = 999999)
+   masked_array(data=[1, -1, 3],
+                mask=[False, False, False],
+          fill_value=999999)
    >>> x.mask
-   array([False,  True, False, False,  True])
+   array([False, False, False, False,  True])
    >>> x.data
    array([ 1, -1,  3,  4,  5])
 
-
 Accessing a field of a masked array with structured datatype returns a
 :class:`MaskedArray`.
 
@@ -421,9 +420,9 @@ ufuncs. Unary and binary functions that have a validity domain (such as
 constant whenever the input is masked or falls outside the validity domain::
 
    >>> ma.log([-1, 0, 1, 2])
-   masked_array(data = [-- -- 0.0 0.69314718056],
-                mask = [ True  True False False],
-          fill_value = 1e+20)
+   masked_array(data=[--, --, 0.0, 0.6931471805599453],
+                mask=[ True,  True, False, False],
+          fill_value=1e+20)
 
 Masked arrays also support standard numpy ufuncs. The output is then a masked
 array. The result of a unary ufunc is masked wherever the input is masked. The
@@ -435,10 +434,9 @@ input fall outside the validity domain::
 
    >>> x = ma.array([-1, 1, 0, 2, 3], mask=[0, 0, 0, 0, 1])
    >>> np.log(x)
-   masked_array(data = [-- -- 0.0 0.69314718056 --],
-                mask = [ True  True False False  True],
-          fill_value = 1e+20)
-
+   masked_array(data=[--, 0.0, --, 0.6931471805599453, --],
+                mask=[ True, False,  True, False,  True],
+          fill_value=1e+20)
 
 
 Examples
@@ -454,11 +452,11 @@ of anomalies (deviations from the average)::
    >>> import numpy.ma as ma
    >>> x = [0.,1.,-9999.,3.,4.]
    >>> mx = ma.masked_values (x, -9999.)
-   >>> print mx.mean()
+   >>> print(mx.mean())
    2.0
-   >>> print mx - mx.mean()
+   >>> print(mx - mx.mean())
    [-2.0 -1.0 -- 1.0 2.0]
-   >>> print mx.anom()
+   >>> print(mx.anom())
    [-2.0 -1.0 -- 1.0 2.0]
 
 
@@ -468,7 +466,7 @@ Filling in the missing data
 Suppose now that we wish to print that same data, but with the missing values
 replaced by the average value.
 
-   >>> print mx.filled(mx.mean())
+   >>> print(mx.filled(mx.mean()))
    [ 0.  1.  2.  3.  4.]
 
 
@@ -478,10 +476,10 @@ Numerical operations
 Numerical operations can be easily performed without worrying about missing
 values, dividing by zero, square roots of negative numbers, etc.::
 
-   >>> import numpy as np, numpy.ma as ma
+   >>> import numpy.ma as ma
    >>> x = ma.array([1., -1., 3., 4., 5., 6.], mask=[0,0,0,0,1,0])
    >>> y = ma.array([1., 2., 0., 4., 5., 6.], mask=[0,0,0,0,0,1])
-   >>> print np.sqrt(x/y)
+   >>> print(ma.sqrt(x/y))
    [1.0 -- -- 1.0 -- --]
 
 Four values of the output are invalid: the first one comes from taking the
@@ -492,8 +490,10 @@ the last two where the inputs were masked.
 Ignoring extreme values
 -----------------------
 
-Let's consider an array ``d`` of random floats between 0 and 1. We wish to
+Let's consider an array ``d`` of floats between 0 and 1. We wish to
 compute the average of the values of ``d`` while ignoring any data outside
-the range ``[0.1, 0.9]``::
+the range ``[0.2, 0.9]``::
 
-   >>> print ma.masked_outside(d, 0.1, 0.9).mean()
+   >>> d = np.linspace(0, 1, 20)
+   >>> print(d.mean() - ma.masked_outside(d, 0.2, 0.9).mean())
+   -0.05263157894736836
diff --git a/doc/source/reference/random/bit_generators/bitgenerators.rst b/doc/source/reference/random/bit_generators/bitgenerators.rst
deleted file mode 100644
index 1474f7dac..000000000
--- a/doc/source/reference/random/bit_generators/bitgenerators.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-:orphan:
-
-BitGenerator
-------------
-
-.. currentmodule:: numpy.random.bit_generator
-
-.. autosummary::
-   :toctree: generated/
-
-    BitGenerator
diff --git a/doc/source/reference/random/bit_generators/index.rst b/doc/source/reference/random/bit_generators/index.rst
index 35d9e5d09..94d3d8a3c 100644
--- a/doc/source/reference/random/bit_generators/index.rst
+++ b/doc/source/reference/random/bit_generators/index.rst
@@ -1,5 +1,3 @@
-.. _bit_generator:
-
 .. currentmodule:: numpy.random
 
 Bit Generators
@@ -35,14 +33,18 @@ The included BitGenerators are:
 .. _`Random123`: https://www.deshawresearch.com/resources_random123.html
 .. _`SFC author's page`: http://pracrand.sourceforge.net/RNG_engines.txt
 
+.. autosummary::
+    :toctree: generated/
+
+    BitGenerator
+
 .. toctree::
-   :maxdepth: 1
+    :maxdepth: 1
 
-   BitGenerator <bitgenerators>
-   MT19937 <mt19937>
-   PCG64 <pcg64>
-   Philox <philox>
-   SFC64 <sfc64>
+    MT19937 <mt19937>
+    PCG64 <pcg64>
+    Philox <philox>
+    SFC64 <sfc64>
 
 Seeding and Entropy
 -------------------
@@ -53,14 +55,14 @@ seed. All of the provided BitGenerators will take an arbitrary-sized
 non-negative integer, or a list of such integers, as a seed. BitGenerators
 need to take those inputs and process them into a high-quality internal state
 for the BitGenerator. All of the BitGenerators in numpy delegate that task to
-`~SeedSequence`, which uses hashing techniques to ensure that even low-quality
+`SeedSequence`, which uses hashing techniques to ensure that even low-quality
 seeds generate high-quality initial states.
 
 .. code-block:: python
 
-  from numpy.random import PCG64
+    from numpy.random import PCG64
 
-  bg = PCG64(12345678903141592653589793)
+    bg = PCG64(12345678903141592653589793)
 
 .. end_block
 
@@ -75,14 +77,14 @@ user, which is up to you.
 
 .. code-block:: python
 
-  from numpy.random import PCG64, SeedSequence
+    from numpy.random import PCG64, SeedSequence
 
-  # Get the user's seed somehow, maybe through `argparse`.
-  # If the user did not provide a seed, it should return `None`.
-  seed = get_user_seed()
-  ss = SeedSequence(seed)
-  print('seed = {}'.format(ss.entropy))
-  bg = PCG64(ss)
+    # Get the user's seed somehow, maybe through `argparse`.
+    # If the user did not provide a seed, it should return `None`.
+    seed = get_user_seed()
+    ss = SeedSequence(seed)
+    print('seed = {}'.format(ss.entropy))
+    bg = PCG64(ss)
 
 .. end_block
 
@@ -104,9 +106,6 @@ or using ``secrets.randbits(128)`` from the standard library are both
 convenient ways.
 
 .. autosummary::
-   :toctree: generated/
+    :toctree: generated/
 
     SeedSequence
-    bit_generator.ISeedSequence
-    bit_generator.ISpawnableSeedSequence
-    bit_generator.SeedlessSeedSequence
diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst
index b0283f3a7..9b19620d8 100644
--- a/doc/source/reference/random/index.rst
+++ b/doc/source/reference/random/index.rst
@@ -123,7 +123,7 @@ The `Generator` is the user-facing object that is nearly identical to
   rg.random()
 
 One can also instantiate `Generator` directly with a `BitGenerator` instance.
-To use the older `~mt19937.MT19937` algorithm, one can instantiate it directly
+To use the older `MT19937` algorithm, one can instantiate it directly
 and pass it to `Generator`.
 
 .. code-block:: python
diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst
index c8815f98f..b3bddb443 100644
--- a/doc/source/reference/random/new-or-different.rst
+++ b/doc/source/reference/random/new-or-different.rst
@@ -10,9 +10,10 @@ What's New or Different
   The Box-Muller method used to produce NumPy's normals is no longer available
   in `Generator`.  It is not possible to reproduce the exact random
   values using ``Generator`` for the normal distribution or any other
-  distribution that relies on the normal such as the `gamma` or
-  `standard_t`. If you require bitwise backward compatible
-  streams, use `RandomState`.
+  distribution that relies on the normal such as the `Generator.gamma` or
+  `Generator.standard_t`. If you require bitwise backward compatible
+  streams, use `RandomState`, i.e., `RandomState.gamma` or
+  `RandomState.standard_t`.
 
 Quick comparison of legacy `mtrand <legacy>`_ to the new `Generator`
 
@@ -20,9 +21,9 @@ Quick comparison of legacy `mtrand <legacy>`_ to the new `Generator`
 Feature            Older Equivalent     Notes
 ------------------ -------------------- -------------
 `~.Generator`      `~.RandomState`      ``Generator`` requires a stream
-                                        source, called a `BitGenerator
-                                        <bit_generators>` A number of these
-                                        are provided.  ``RandomState`` uses
+                                        source, called a `BitGenerator`.
+                                        A number of these are provided.
+                                        ``RandomState`` uses
                                         the Mersenne Twister `~.MT19937` by
                                         default, but can also be instantiated
                                         with any BitGenerator.
diff --git a/doc/source/reference/random/parallel.rst b/doc/source/reference/random/parallel.rst
index 2f79f22d8..721584014 100644
--- a/doc/source/reference/random/parallel.rst
+++ b/doc/source/reference/random/parallel.rst
@@ -18,10 +18,10 @@ a `~BitGenerator`. It uses hashing techniques to ensure that low-quality seeds
 are turned into high quality initial states (at least, with very high
 probability).
 
-For example, `~mt19937.MT19937` has a state consisting of 624
+For example, `MT19937` has a state consisting of 624
 `uint32` integers. A naive way to take a 32-bit integer seed would be to just set
 the last element of the state to the 32-bit seed and leave the rest 0s. This is
-a valid state for `~mt19937.MT19937`, but not a good one. The Mersenne Twister
+a valid state for `MT19937`, but not a good one. The Mersenne Twister
 algorithm `suffers if there are too many 0s`_. Similarly, two adjacent 32-bit
 integer seeds (i.e. ``12345`` and ``12346``) would produce very similar
 streams.
@@ -91,15 +91,15 @@ territory ([2]_).
 .. [2] In this calculation, we can ignore the amount of numbers drawn from each
        stream. Each of the PRNGs we provide has some extra protection built in
        that avoids overlaps if the `~SeedSequence` pools differ in the
-       slightest bit. `~pcg64.PCG64` has :math:`2^{127}` separate cycles
+       slightest bit. `PCG64` has :math:`2^{127}` separate cycles
        determined by the seed in addition to the position in the
        :math:`2^{128}` long period for each cycle, so one has to both get on or
        near the same cycle *and* seed a nearby position in the cycle.
-       `~philox.Philox` has completely independent cycles determined by the seed.
-       `~sfc64.SFC64` incorporates a 64-bit counter so every unique seed is at
+       `Philox` has completely independent cycles determined by the seed.
+       `SFC64` incorporates a 64-bit counter so every unique seed is at
        least :math:`2^{64}` iterations away from any other seed. And
-       finally, `~mt19937.MT19937` has just an unimaginably huge period. Getting
-       a collision internal to `~SeedSequence` is the way a failure would be
+       finally, `MT19937` has just an unimaginably huge period. Getting
+       a collision internal to `SeedSequence` is the way a failure would be
        observed.
 
 .. _`implements an algorithm`: http://www.pcg-random.org/posts/developing-a-seed_seq-alternative.html
@@ -113,10 +113,10 @@ territory ([2]_).
 Independent Streams
 -------------------
 
-:class:`~philox.Philox` is a counter-based RNG based which generates values by
+`Philox` is a counter-based RNG which generates values by
 encrypting an incrementing counter using weak cryptographic primitives. The
 seed determines the key that is used for the encryption. Unique keys create
-unique, independent streams. :class:`~philox.Philox` lets you bypass the
+unique, independent streams. `Philox` lets you bypass the
 seeding algorithm to directly set the 128-bit key. Similar, but different, keys
 will still create independent streams.
 
diff --git a/doc/source/reference/random/performance.rst b/doc/source/reference/random/performance.rst
index 2d5fca496..d70dd064a 100644
--- a/doc/source/reference/random/performance.rst
+++ b/doc/source/reference/random/performance.rst
@@ -5,21 +5,21 @@ Performance
 
 Recommendation
 **************
-The recommended generator for general use is :class:`~pcg64.PCG64`. It is
+The recommended generator for general use is `PCG64`. It is
 statistically high quality, full-featured, and fast on most platforms, but
 somewhat slow when compiled for 32-bit processes.
 
-:class:`~philox.Philox` is fairly slow, but its statistical properties have
+`Philox` is fairly slow, but its statistical properties have
 very high quality, and it is easy to get assuredly-independent stream by using
 unique keys. If that is the style you wish to use for parallel streams, or you
 are porting from another system that uses that style, then
-:class:`~philox.Philox` is your choice.
+`Philox` is your choice.
 
-:class:`~sfc64.SFC64` is statistically high quality and very fast. However, it
+`SFC64` is statistically high quality and very fast. However, it
 lacks jumpability. If you are not using that capability and want lots of speed,
 even on 32-bit processes, this is your choice.
 
-:class:`~mt19937.MT19937` `fails some statistical tests`_ and is not especially
+`MT19937` `fails some statistical tests`_ and is not especially
 fast compared to modern PRNGs. For these reasons, we mostly do not recommend
 using it on its own, only through the legacy `~.RandomState` for
 reproducing old results. That said, it has a very long history as a default in
@@ -31,20 +31,20 @@ Timings
 *******
 
 The timings below are the time in ns to produce 1 random value from a
-specific distribution.  The original :class:`~mt19937.MT19937` generator is
+specific distribution.  The original `MT19937` generator is
 much slower since it requires 2 32-bit values to equal the output of the
 faster generators.
 
 Integer performance has a similar ordering.
 
 The pattern is similar for other, more complex generators. The normal
-performance of the legacy :class:`~.RandomState` generator is much
+performance of the legacy `RandomState` generator is much
 lower than the other since it uses the Box-Muller transformation rather
 than the Ziggurat generator. The performance gap for Exponentials is also
 large due to the cost of computing the log function to invert the CDF.
 The column labeled MT19973 is used the same 32-bit generator as
-:class:`~.RandomState` but produces random values using
-:class:`~Generator`.
+`RandomState` but produces random values using
+`Generator`.
 
 .. csv-table::
     :header: ,MT19937,PCG64,Philox,SFC64,RandomState
@@ -61,7 +61,7 @@ The column labeled MT19973 is used the same 32-bit generator as
     Poissons,67.6,52.4,69.2,46.4,78.1
 
 The next table presents the performance in percentage relative to values
-generated by the legacy generator, `RandomState(MT19937())`. The overall
+generated by the legacy generator, ``RandomState(MT19937())``. The overall
 performance was computed using a geometric mean.
 
 .. csv-table::
diff --git a/doc/source/release.rst b/doc/source/release.rst
index fb4e2b14d..3bfe81243 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -6,7 +6,8 @@ Release Notes
     :maxdepth: 3
 
     1.18.0 <release/1.18.0-notes>
-    1.17.1 <release/1.17.2-notes>
+    1.17.3 <release/1.17.3-notes>
+    1.17.2 <release/1.17.2-notes>
     1.17.1 <release/1.17.1-notes>
     1.17.0 <release/1.17.0-notes>
     1.16.5 <release/1.16.5-notes>
diff --git a/doc/source/release/1.17.0-notes.rst b/doc/source/release/1.17.0-notes.rst
index 8d69e36d9..a0e737982 100644
--- a/doc/source/release/1.17.0-notes.rst
+++ b/doc/source/release/1.17.0-notes.rst
@@ -239,7 +239,7 @@ New extensible `numpy.random` module with selectable random number generators
 -----------------------------------------------------------------------------
 A new extensible `numpy.random` module along with four selectable random number
 generators and improved seeding designed for use in parallel processes has been
-added. The currently available :ref:`Bit Generators <bit_generator>` are
+added. The currently available `Bit Generators` are
 `~mt19937.MT19937`, `~pcg64.PCG64`, `~philox.Philox`, and `~sfc64.SFC64`.
 ``PCG64`` is the new default while ``MT19937`` is retained for backwards
 compatibility. Note that the legacy random module is unchanged and is now
diff --git a/doc/source/release/1.17.3-notes.rst b/doc/source/release/1.17.3-notes.rst
new file mode 100644
index 000000000..e33ca1917
--- /dev/null
+++ b/doc/source/release/1.17.3-notes.rst
@@ -0,0 +1,59 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.17.3 Release Notes
+==========================
+
+This release contains fixes for bugs reported against NumPy 1.17.2 along with
+some documentation improvements. The Python versions supported in this release
+are 3.5-3.8.
+
+Downstream developers should use Cython >= 0.29.13 for Python 3.8 support and
+OpenBLAS >= 3.7 to avoid errors on the Skylake architecture.
+
+
+Highlights
+==========
+
+- Wheels for Python 3.8
+- Boolean ``matmul`` fixed to use booleans instead of integers.
+
+
+Compatibility notes
+===================
+
+- The seldom used ``PyArray_DescrCheck`` macro has been changed/fixed.
+
+
+Contributors
+============
+
+A total of 7 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Allan Haldane
+* Charles Harris
+* Kevin Sheppard
+* Matti Picus
+* Ralf Gommers
+* Sebastian Berg
+* Warren Weckesser
+
+
+Pull requests merged
+====================
+
+A total of 12 pull requests were merged for this release.
+
+* `#14456 <https://github.com/numpy/numpy/pull/14456>`__: MAINT: clean up pocketfft modules inside numpy.fft namespace.
+* `#14463 <https://github.com/numpy/numpy/pull/14463>`__: BUG: random.hypergeometic assumes npy_long is npy_int64, hung...
+* `#14502 <https://github.com/numpy/numpy/pull/14502>`__: BUG: random: Revert gh-14458 and refix gh-14557.
+* `#14504 <https://github.com/numpy/numpy/pull/14504>`__: BUG: add a specialized loop for boolean matmul.
+* `#14506 <https://github.com/numpy/numpy/pull/14506>`__: MAINT: Update pytest version for Python 3.8
+* `#14512 <https://github.com/numpy/numpy/pull/14512>`__: DOC: random: fix doc linking, was referencing private submodules.
+* `#14513 <https://github.com/numpy/numpy/pull/14513>`__: BUG,MAINT: Some fixes and minor cleanup based on clang analysis
+* `#14515 <https://github.com/numpy/numpy/pull/14515>`__: BUG: Fix randint when range is 2**32
+* `#14519 <https://github.com/numpy/numpy/pull/14519>`__: MAINT: remove the entropy c-extension module
+* `#14563 <https://github.com/numpy/numpy/pull/14563>`__: DOC: remove note about Pocketfft license file (non-existing here).
+* `#14578 <https://github.com/numpy/numpy/pull/14578>`__: BUG: random: Create a legacy implementation of random.binomial.
+* `#14687 <https://github.com/numpy/numpy/pull/14687>`__: BUG: properly define PyArray_DescrCheck
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index a23a7b2c7..6211d0c69 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -206,8 +206,8 @@ of elements that we want, instead of the step::
     `empty_like`,
     `arange`,
     `linspace`,
-    `numpy.random.mtrand.RandomState.rand`,
-    `numpy.random.mtrand.RandomState.randn`,
+    `numpy.random.RandomState.rand`,
+    `numpy.random.RandomState.randn`,
     `fromfunction`,
     `fromfile`
 
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index b241d1f9d..05e401e0b 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -247,55 +247,13 @@ class _missing_ctypes(object):
             self.value = ptr
 
 
-class _unsafe_first_element_pointer(object):
-    """
-    Helper to allow viewing an array as a ctypes pointer to the first element
-
-    This avoids:
-      * dealing with strides
-      * `.view` rejecting object-containing arrays
-      * `memoryview` not supporting overlapping fields
-    """
-    def __init__(self, arr):
-        self.base = arr
-
-    @property
-    def __array_interface__(self):
-        i = dict(
-            shape=(),
-            typestr='|V0',
-            data=(self.base.__array_interface__['data'][0], False),
-            strides=(),
-            version=3,
-        )
-        return i
-
-
-def _get_void_ptr(arr):
-    """
-    Get a `ctypes.c_void_p` to arr.data, that keeps a reference to the array
-    """
-    import numpy as np
-    # convert to a 0d array that has a data pointer referrign to the start
-    # of arr. This holds a reference to arr.
-    simple_arr = np.asarray(_unsafe_first_element_pointer(arr))
-
-    # create a `char[0]` using the same memory.
-    c_arr = (ctypes.c_char * 0).from_buffer(simple_arr)
-
-    # finally cast to void*
-    return ctypes.cast(ctypes.pointer(c_arr), ctypes.c_void_p)
-
-
 class _ctypes(object):
     def __init__(self, array, ptr=None):
         self._arr = array
 
         if ctypes:
             self._ctypes = ctypes
-            # get a void pointer to the buffer, which keeps the array alive
-            self._data = _get_void_ptr(array)
-            assert self._data.value == ptr
+            self._data = self._ctypes.c_void_p(ptr)
         else:
             # fake a pointer-like object that holds onto the reference
             self._ctypes = _missing_ctypes()
@@ -317,7 +275,14 @@ class _ctypes(object):
 
         The returned pointer will keep a reference to the array.
         """
-        return self._ctypes.cast(self._data, obj)
+        # _ctypes.cast function causes a circular reference of self._data in
+        # self._data._objects. Attributes of self._data cannot be released
+        # until gc.collect is called. Make a copy of the pointer first then let
+        # it hold the array reference. This is a workaround to circumvent the
+        # CPython bug https://bugs.python.org/issue12836
+        ptr = self._ctypes.cast(self._data, obj)
+        ptr._arr = self._arr
+        return ptr
 
     def shape_as(self, obj):
         """
@@ -385,7 +350,7 @@ class _ctypes(object):
 
         Enables `c_func(some_array.ctypes)`
         """
-        return self._data
+        return self.data_as(ctypes.c_void_p)
 
     # kept for compatibility
     get_data = data.fget
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 0da6ed78a..401018015 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -1479,7 +1479,11 @@ def array_repr(arr, max_line_width=None, precision=None, suppress_small=None):
         arr, max_line_width, precision, suppress_small)
 
 
-_guarded_str = _recursive_guard()(str)
+@_recursive_guard()
+def _guarded_repr_or_str(v):
+    if isinstance(v, bytes):
+        return repr(v)
+    return str(v)
 
 
 def _array_str_implementation(
@@ -1497,7 +1501,7 @@ def _array_str_implementation(
         # obtain a scalar and call str on it, avoiding problems for subclasses
         # for which indexing with () returns a 0d instead of a scalar by using
         # ndarray's getindex. Also guard against recursive 0d object arrays.
-        return _guarded_str(np.ndarray.__getitem__(a, ()))
+        return _guarded_repr_or_str(np.ndarray.__getitem__(a, ()))
 
     return array2string(a, max_line_width, precision, suppress_small, ' ', "")
 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6729fe197..e0b6a654c 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -287,7 +287,7 @@ defdict = {
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -358,14 +358,14 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -395,7 +395,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly),
+          TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD(cmplx, out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
@@ -409,7 +409,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]),
+          TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -762,7 +762,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sqrt'),
           None,
           TD('e', f='sqrt', astype={'e':'f'}),
-          TD(inexactvec),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
           ),
@@ -777,14 +777,18 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.ceil'),
           None,
-          TD(flts, f='ceil', astype={'e':'f'}),
+          TD('e', f='ceil', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='ceil'),
           TD(O, f='npy_ObjectCeil'),
           ),
 'trunc':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.trunc'),
           None,
-          TD(flts, f='trunc', astype={'e':'f'}),
+          TD('e', f='trunc', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
           ),
 'fabs':
@@ -798,14 +802,18 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.floor'),
           None,
-          TD(flts, f='floor', astype={'e':'f'}),
+          TD('e', f='floor', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
           ),
 'rint':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.rint'),
           None,
-          TD(inexact, f='rint', astype={'e':'f'}),
+          TD('e', f='rint', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
           ),
 'arctan2':
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 833dd9e47..2c148712f 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -960,6 +960,9 @@ def tensordot(a, b, axes=2):
     two sequences of the same length, with the first axis to sum over given
     first in both sequences, the second axis second, and so forth.
 
+    The shape of the result consists of the non-contracted axes of the
+    first tensor, followed by the non-contracted axes of the second.
+
     Examples
     --------
     A "traditional" example:
@@ -1781,19 +1784,19 @@ def _frombuffer(buf, dtype, shape, order):
 
 
 @set_module('numpy')
-def isscalar(num):
+def isscalar(element):
     """
-    Returns True if the type of `num` is a scalar type.
+    Returns True if the type of `element` is a scalar type.
 
     Parameters
     ----------
-    num : any
+    element : any
         Input argument, can be of any type and shape.
 
     Returns
     -------
     val : bool
-        True if `num` is a scalar type, False if it is not.
+        True if `element` is a scalar type, False if it is not.
 
     See Also
     --------
@@ -1801,10 +1804,14 @@ def isscalar(num):
 
     Notes
     -----
-    In almost all cases ``np.ndim(x) == 0`` should be used instead of this
-    function, as that will also return true for 0d arrays. This is how
-    numpy overloads functions in the style of the ``dx`` arguments to `gradient`
-    and the ``bins`` argument to `histogram`. Some key differences:
+    If you need a stricter way to identify a *numerical* scalar, use
+    ``isinstance(x, numbers.Number)``, as that returns ``False`` for most
+    non-numerical elements such as strings.
+
+    In most cases ``np.ndim(x) == 0`` should be used instead of this function,
+    as that will also return true for 0d arrays. This is how numpy overloads
+    functions in the style of the ``dx`` arguments to `gradient` and the ``bins``
+    argument to `histogram`. Some key differences:
 
     +--------------------------------------+---------------+-------------------+
     | x                                    |``isscalar(x)``|``np.ndim(x) == 0``|
@@ -1852,9 +1859,9 @@ def isscalar(num):
     True
 
     """
-    return (isinstance(num, generic)
-            or type(num) in ScalarType
-            or isinstance(num, numbers.Number))
+    return (isinstance(element, generic)
+            or type(element) in ScalarType
+            or isinstance(element, numbers.Number))
 
 
 @set_module('numpy')
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 5d9e990e8..152a2be9c 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3078,6 +3078,7 @@ BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
  * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
  * #iscomplex = 0*14, 1*3, 0*2#
  * #incr = ip++*14, ip+=2*3, ip++*2#
+ * #isdatetime = 0*17, 1*2#
  */
 static int
 @fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind,
@@ -3103,6 +3104,12 @@ static int
         return 0;
     }
 #endif
+#if @isdatetime@
+    if (mp == NPY_DATETIME_NAT) {
+        /* NaT encountered, it's maximal */
+        return 0;
+    }
+#endif
 
     for (i = 1; i < n; i++) {
         @incr@;
@@ -3122,6 +3129,13 @@ static int
             }
         }
 #else
+#if @isdatetime@
+        if (*ip == NPY_DATETIME_NAT) {
+            /* NaT encountered, it's maximal */
+            *max_ind = i;
+            break;
+        }
+#endif
         if (!@le@(*ip, mp)) {  /* negated, for correct nan handling */
             mp = *ip;
             *max_ind = i;
@@ -3158,16 +3172,19 @@ BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind,
  * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *          LONG, ULONG, LONGLONG, ULONGLONG,
  *          HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *          CFLOAT, CDOUBLE, CLONGDOUBLE#
+ *          CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *          DATETIME, TIMEDELTA#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
  *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_float, npy_double, npy_longdouble#
- * #isfloat = 0*10, 1*7#
- * #isnan = nop*10, npy_half_isnan, npy_isnan*6#
- * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*6#
- * #iscomplex = 0*14, 1*3#
- * #incr = ip++*14, ip+=2*3#
+ *         npy_float, npy_double, npy_longdouble,
+ *         npy_datetime, npy_timedelta#
+ * #isfloat = 0*10, 1*7, 0*2#
+ * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2#
+ * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
+ * #iscomplex = 0*14, 1*3, 0*2#
+ * #incr = ip++*14, ip+=2*3, ip++*2#
+ * #isdatetime = 0*17, 1*2#
  */
 static int
 @fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
@@ -3193,6 +3210,12 @@ static int
         return 0;
     }
 #endif
+#if @isdatetime@
+    if (mp == NPY_DATETIME_NAT) {
+        /* NaT encountered, it's minimal */
+        return 0;
+    }
+#endif
 
     for (i = 1; i < n; i++) {
         @incr@;
@@ -3212,6 +3235,13 @@ static int
             }
         }
 #else
+#if @isdatetime@
+        if (*ip == NPY_DATETIME_NAT) {
+            /* NaT encountered, it's minimal */
+            *min_ind = i;
+            break;
+        }
+#endif 
         if (!@le@(mp, *ip)) {  /* negated, for correct nan handling */
             mp = *ip;
             *min_ind = i;
@@ -3231,43 +3261,6 @@ static int
 
 #undef _LESS_THAN_OR_EQUAL
 
-/**begin repeat
- *
- * #fname = DATETIME, TIMEDELTA#
- * #type = npy_datetime, npy_timedelta#
- */
-static int
-@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
-        PyArrayObject *NPY_UNUSED(aip))
-{
-    /* NPY_DATETIME_NAT is smaller than every other value, we skip
-     * it for consistency with min().
-     */
-    npy_intp i;
-    @type@ mp = NPY_DATETIME_NAT;
-
-    i = 0;
-    while (i < n && mp == NPY_DATETIME_NAT) {
-        mp = ip[i];
-        i++;
-    }
-    if (i == n) {
-        /* All NaTs: return 0 */
-        *min_ind = 0;
-        return 0;
-    }
-    *min_ind = i - 1;
-    for (; i < n; i++) {
-        if (mp > ip[i] && ip[i] != NPY_DATETIME_NAT) {
-            mp = ip[i];
-            *min_ind = i;
-        }
-    }
-    return 0;
-}
-
-/**end repeat**/
-
 static int
 OBJECT_argmax(PyObject **ip, npy_intp n, npy_intp *max_ind,
               PyArrayObject *NPY_UNUSED(aip))
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index d21bb9776..de81bcea1 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -3221,18 +3221,6 @@ NPY_NO_EXPORT PyArrayObject *
 datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
                 PyArray_Descr *dtype)
 {
-    PyArray_DatetimeMetaData meta;
-    /*
-     * Both datetime and timedelta are stored as int64, so they can
-     * share value variables.
-     */
-    npy_int64 values[3];
-    PyObject *objs[3];
-    int type_nums[3];
-
-    npy_intp i, length;
-    PyArrayObject *ret;
-    npy_int64 *ret_data;
 
     /*
      * First normalize the input parameters so there is no Py_None,
@@ -3265,6 +3253,8 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     /* Check if the units of the given dtype are generic, in which
      * case we use the code path that detects the units
      */
+    int type_nums[3];
+    PyArray_DatetimeMetaData meta;
     if (dtype != NULL) {
         PyArray_DatetimeMetaData *meta_tmp;
 
@@ -3313,6 +3303,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Set up to convert the objects to a common datetime unit metadata */
+    PyObject *objs[3];
     objs[0] = start;
     objs[1] = stop;
     objs[2] = step;
@@ -3333,11 +3324,22 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
         type_nums[2] = NPY_TIMEDELTA;
     }
 
-    /* Convert all the arguments */
+    /* Convert all the arguments
+     *
+     * Both datetime and timedelta are stored as int64, so they can
+     * share value variables.
+     */
+    npy_int64 values[3];
     if (convert_pyobjects_to_datetimes(3, objs, type_nums,
                                 NPY_SAME_KIND_CASTING, values, &meta) < 0) {
         return NULL;
     }
+    /* If no start was provided, default to 0 */
+    if (start == NULL) {
+        /* enforced above */
+        assert(type_nums[0] == NPY_TIMEDELTA);
+        values[0] = 0;
+    }
 
     /* If no step was provided, default to 1 */
     if (step == NULL) {
@@ -3362,6 +3364,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Calculate the array length */
+    npy_intp length;
     if (values[2] > 0 && values[1] > values[0]) {
         length = (values[1] - values[0] + (values[2] - 1)) / values[2];
     }
@@ -3389,19 +3392,20 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Create the result array */
-    ret = (PyArrayObject *)PyArray_NewFromDescr(
-                            &PyArray_Type, dtype, 1, &length, NULL,
-                            NULL, 0, NULL);
+    PyArrayObject *ret = (PyArrayObject *)PyArray_NewFromDescr(
+            &PyArray_Type, dtype, 1, &length, NULL,
+            NULL, 0, NULL);
+
     if (ret == NULL) {
         return NULL;
     }
 
     if (length > 0) {
         /* Extract the data pointer */
-        ret_data = (npy_int64 *)PyArray_DATA(ret);
+        npy_int64 *ret_data = (npy_int64 *)PyArray_DATA(ret);
 
         /* Create the timedeltas or datetimes */
-        for (i = 0; i < length; ++i) {
+        for (npy_intp i = 0; i < length; ++i) {
             *ret_data = values[0];
             values[0] += values[2];
             ret_data++;
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 734255a9d..23d140cf6 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1385,7 +1385,6 @@ NPY_NO_EXPORT int
 PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at)
 {
     int check_num = NPY_NOTYPE + 10;
-    PyObject *item;
     int elsize = 0;
     char endian = '=';
 
@@ -1664,16 +1663,22 @@ finish:
         PyErr_Clear();
         /* Now check to see if the object is registered in typeDict */
         if (typeDict != NULL) {
-            item = PyDict_GetItem(typeDict, obj);
+            PyObject *item = NULL;
 #if defined(NPY_PY3K)
-            if (!item && PyBytes_Check(obj)) {
+            if (PyBytes_Check(obj)) {
                 PyObject *tmp;
                 tmp = PyUnicode_FromEncodedObject(obj, "ascii", "strict");
-                if (tmp != NULL) {
-                    item = PyDict_GetItem(typeDict, tmp);
-                    Py_DECREF(tmp);
+                if (tmp == NULL) {
+                    goto fail;
                 }
+                item = PyDict_GetItem(typeDict, tmp);
+                Py_DECREF(tmp);
+            }
+            else {
+                item = PyDict_GetItem(typeDict, obj);
             }
+#else
+            item = PyDict_GetItem(typeDict, obj);
 #endif
             if (item) {
                 /* Check for a deprecated Numeric-style typecode */
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5443223ab..e6d8eca0d 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1294,10 +1294,10 @@ NPY_NO_EXPORT void
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ in2 = *(@type@ *)ip2;
         if (in1 == NPY_DATETIME_NAT) {
-            *((@type@ *)op1) = in2;
+            *((@type@ *)op1) = in1;
         }
         else if (in2 == NPY_DATETIME_NAT) {
-            *((@type@ *)op1) = in1;
+            *((@type@ *)op1) = in2;
         }
         else {
             *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
@@ -1635,6 +1635,30 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*  #type = npy_float, npy_double#
+*  #typesub = f, #
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = @scalarf@@typesub@(in1);
+    }
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  *  #func = sin, cos, exp, log#
  *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */
@@ -1657,6 +1681,78 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
  */
 
 /**begin repeat1
+ *  #TYPE = FLOAT, DOUBLE#
+ *  #type = npy_float, npy_double#
+ *  #typesub = f, #
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ tmp = in1 > 0 ? in1 : -in1;
+            /* add 0 to clear -0.0 */
+            *((@type@ *)op1) = tmp + 0;
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = in1*in1;
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = 1.0f/in1;
+        }
+    }
+}
+
+/**begin repeat2
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = @scalarf@@typesub@(in1);
+        }
+    }
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
  *  #func = exp, log#
  *  #scalarf = npy_expf, npy_logf#
  */
@@ -1706,10 +1802,9 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 
 /**end repeat1**/
-
-
 /**end repeat**/
 
+
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble, npy_float#
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5070ab38b..0ef14a809 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -7,14 +7,12 @@
 #define _NPY_UMATH_LOOPS_H_
 
 #define BOOL_invert BOOL_logical_not
-#define BOOL_negative BOOL_logical_not
 #define BOOL_add BOOL_logical_or
 #define BOOL_bitwise_and BOOL_logical_and
 #define BOOL_bitwise_or BOOL_logical_or
 #define BOOL_logical_xor BOOL_not_equal
 #define BOOL_bitwise_xor BOOL_logical_xor
 #define BOOL_multiply BOOL_logical_and
-#define BOOL_subtract BOOL_logical_xor
 #define BOOL_maximum BOOL_logical_or
 #define BOOL_minimum BOOL_logical_and
 #define BOOL_fmax BOOL_maximum
@@ -175,6 +173,19 @@ NPY_NO_EXPORT void
  */
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, fma#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat2**/
+/**end repeat1**/
 /**end repeat**/
 
 /**begin repeat
@@ -194,6 +205,26 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 /**end repeat**/
 
 /**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+/**begin repeat2
+ * #isa = avx512f, fma#
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, f, , l#
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index 8d67f96ac..43bed425c 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -494,32 +494,18 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
             }
             else {
                 /* not a tuple */
-                if (nout > 1 && DEPRECATE("passing a single argument to the "
-                                          "'out' keyword argument of a "
-                                          "ufunc with\n"
-                                          "more than one output will "
-                                          "result in an error in the "
-                                          "future") < 0) {
-                    /*
-                     * If the deprecation is removed, also remove the loop
-                     * below setting tuple items to None (but keep this future
-                     * error message.)
-                     */
+                if (nout > 1) {
                     PyErr_SetString(PyExc_TypeError,
                                     "'out' must be a tuple of arguments");
                     goto fail;
                 }
                 if (out != Py_None) {
                     /* not already a tuple and not None */
-                    PyObject *out_tuple = PyTuple_New(nout);
+                    PyObject *out_tuple = PyTuple_New(1);
 
                     if (out_tuple == NULL) {
                         goto fail;
                     }
-                    for (i = 1; i < nout; i++) {
-                        Py_INCREF(Py_None);
-                        PyTuple_SET_ITEM(out_tuple, i, Py_None);
-                    }
                     /* out was borrowed ref; make it permanent */
                     Py_INCREF(out);
                     /* steals reference */
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 88e5e1f1b..74f52cc9d 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -139,6 +139,37 @@ abs_ptrdiff(char *a, char *b)
 /* prototypes */
 
 /**begin repeat1
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ */
+
+/**begin repeat2
+ *  #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ */
+
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+    /* Returns 1 when the vectorized kernel ran, 0 when it did not. */
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
+        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+        return 1;
+    }
+#endif
+    /* SIMD path not compiled in, or the blockable check above failed */
+    return 0;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
  * #func = exp, log#
  */
 
@@ -185,7 +216,6 @@ run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps,
 /**end repeat**/
 
 
-
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
@@ -1144,41 +1174,76 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask(void)
+fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_full_load_mask_pd(void)
+{
+    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                             1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-    float* addr = maskint + total_elem - num_lanes;
+    float* addr = maskint + num_lanes - num_elem;
     return _mm256_loadu_ps(addr);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
+{
+    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
+    npy_int* addr = maskint + 2*num_lanes - 2*num_elem; /* 2 npy_int per double (assumes 32-bit npy_int) */
+    return _mm256_loadu_si256((__m256i*) addr);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather(__m256 src,
-                   npy_float* addr,
-                   __m256i vindex,
-                   __m256 mask)
+fma_masked_gather_ps(__m256 src,
+                     npy_float* addr,
+                     __m256i vindex,
+                     __m256 mask)
 {
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_gather_pd(__m256d src,
+                     npy_double* addr,
+                     __m128i vindex,
+                     __m256d mask)
+{
+    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load(__m256 mask, npy_float* addr)
+fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_load_pd(__m256i mask, npy_double* addr)
+{
+    return _mm256_maskload_pd(addr, mask);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
+{
+    return _mm256_blendv_pd(x, val, mask);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
@@ -1186,6 +1251,18 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_invert_mask_ps(__m256 ymask)
+{
+    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_invert_mask_pd(__m256i ymask)
+{
+    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
 {
    return _mm256_cvtepi32_ps(
@@ -1290,42 +1367,115 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
      }
 }
 
+/**begin repeat
+ *  #vsub = ps, pd#
+ *  #vtype = __m256, __m256d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_abs_@vsub@(@vtype@ x)
+{
+    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_reciprocal_@vsub@(@vtype@ x)
+{
+    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_rint_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_floor_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_ceil_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_trunc_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
+}
+/**end repeat**/
 #endif
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask(void)
+avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+    return 0xFF;
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+    return (0x01 << num_elem) - 0x01;
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather(__m512 src,
-                     npy_float* addr,
-                     __m512i vindex,
-                     __mmask16 kmask)
+avx512_masked_gather_ps(__m512 src,
+                        npy_float* addr,
+                        __m512i vindex,
+                        __mmask16 kmask)
 {
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_gather_pd(__m512d src,
+                        npy_double* addr,
+                        __m256i vindex,
+                        __mmask8 kmask)
+{
+    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load(__mmask16 mask, npy_float* addr)
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
+{
+    return _mm512_mask_blend_pd(mask, x, val);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
@@ -1333,6 +1483,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_invert_mask_ps(__mmask16 ymask)
+{
+    return _mm512_knot(ymask); /* bitwise NOT of all 16 lane bits */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_invert_mask_pd(__mmask8 ymask)
+{
+    return _mm512_knot(ymask); /* 16-bit mask NOT; the return type keeps the low 8 bits */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
 {
     return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
@@ -1361,6 +1523,49 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 {
     return _mm512_scalef_ps(poly, quadrant);
 }
+/**begin repeat
+ *  #vsub  = ps, pd#
+ *  #epi_vsub  = epi32, epi64#
+ *  #vtype = __m512, __m512d#
+ *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_abs_@vsub@(@vtype@ x)
+{
+    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x, /* AND with and_const: all bits but the top one */
+				    _mm512_set1_@epi_vsub@ (@and_const@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_reciprocal_@vsub@(@vtype@ x)
+{
+    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); /* 1/x by lane-wise division */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_rint_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x08); /* imm8: nearest-even, precision exception suppressed */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_floor_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x09); /* imm8: toward -inf, precision exception suppressed */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_ceil_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x0A); /* imm8: toward +inf, precision exception suppressed */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_trunc_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x0B); /* imm8: toward zero, precision exception suppressed */
+}
+/**end repeat**/
 #endif
 
 /**begin repeat
@@ -1438,7 +1643,187 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     sin = @fmadd@(sin, x, x);
     return sin;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sqrt_ps(@vtype@ x)
+{
+    return _mm@vsize@_sqrt_ps(x); /* lane-wise single-precision square root */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_sqrt_pd(@vtype@d x) /* NOTE(review): vtype + "d" is presumably the double vector type; repeat header not shown */
+{
+    return _mm@vsize@_sqrt_pd(x); /* lane-wise double-precision square root */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_square_ps(@vtype@ x)
+{
+    return _mm@vsize@_mul_ps(x,x); /* x*x per lane */
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_square_pd(@vtype@d x)
+{
+    return _mm@vsize@_mul_pd(x,x); /* x*x per lane */
+}
+
+#endif
+/**end repeat**/
+
+
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256, __mmask16#
+ * #vsub = , _mask#
+ * #vtype = __m256, __m512#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
+ *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
+ *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ */
+
+#if defined @CHK@
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_FLOAT(npy_float* op,             /* output, written contiguously */
+                   npy_float* ip,             /* input, read every `steps` bytes */
+                   const npy_intp array_size, /* number of elements to process */
+                   const npy_intp steps)      /* input stride in bytes */
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float); /* signed: keeps a negative stride negative */
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_intp num_remaining_elements = array_size;
+    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
+#if @replace_0_with_1@
+    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+#endif
+    npy_int indexarr[16]; /* gather offsets, in elements relative to ip */
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride; /* narrows to npy_int; assumes callers bound the stride - TODO confirm */
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+    /* one vector of num_lanes elements per pass; the tail uses a partial mask */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+#if @replace_0_with_1@
+            inv_load_mask = @isa@_invert_mask_ps(load_mask);
+#endif
+        }
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load_ps(load_mask, ip);
+#if @replace_0_with_1@
+            /*
+             * Set the lanes excluded by load_mask to 1.0f so that reciprocal
+             * does not raise a divide by zero fp exception for them
+             */
+            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
+#endif
+        }
+        else {
+            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); /* unselected lanes read as 1.0f */
+        }
+        @vtype@ out = @isa@_@vectorf@_ps(x);
+        @masked_store@(op, @cvtps_epi32@(load_mask), out);
+        /* advance input by the strided span, output by one full vector */
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256i, __mmask8#
+ * #vsub = , _mask#
+ * #vtype = __m256d, __m512d#
+ * #vindextype = __m128i, __m256i#
+ * #vindexsize = 128, 256#
+ * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
+ * #cvtps_epi32 = _mm256_cvtpd_epi32, #
+ * #castmask = _mm256_castsi256_pd, #
+ * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
+ *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
+ *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ */
+
+#if defined @CHK@
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_DOUBLE(npy_double* op,            /* output, written contiguously */
+                    npy_double* ip,            /* input, read every `steps` bytes */
+                    const npy_intp array_size, /* number of elements to process */
+                    const npy_intp steps)      /* input stride in bytes */
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_double); /* signed: keeps a negative stride negative */
+    const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+    npy_intp num_remaining_elements = array_size;
+    @mask@ load_mask = @isa@_get_full_load_mask_pd();
+#if @replace_0_with_1@
+    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+#endif
+    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0);
+    npy_int indexarr[8]; /* gather offsets, in elements relative to ip */
+    for (npy_int ii = 0; ii < 8; ii++) {
+        indexarr[ii] = ii*stride; /* narrows to npy_int; assumes callers bound the stride - TODO confirm */
+    }
+    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
+    /* one vector of num_lanes elements per pass; the tail uses a partial mask */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
+                                                       num_lanes);
+#if @replace_0_with_1@
+            inv_load_mask = @isa@_invert_mask_pd(load_mask);
 #endif
+        }
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load_pd(load_mask, ip);
+#if @replace_0_with_1@
+            /*
+             * Set the lanes excluded by load_mask to 1.0 so that reciprocal
+             * does not raise a divide by zero fp exception for them
+             */
+            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
+#endif
+        }
+        else {
+            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask)); /* unselected lanes read as 1.0 */
+        }
+        @vtype@ out = @isa@_@vectorf@_pd(x);
+        @masked_store@(op, load_mask, out);
+        /* advance input by the strided span, output by one full vector */
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+#endif
+/**end repeat1**/
 /**end repeat**/
 
 /**begin repeat
@@ -1460,7 +1845,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
 
-
 /*
  * Vectorized approximate sine/cosine algorithms: The following code is a
  * vectorized version of the algorithm presented here:
@@ -1519,7 +1903,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
     @vtype@i iquadrant;
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
     npy_int indexarr[16];
     for (npy_int ii = 0; ii < 16; ii++) {
@@ -1530,16 +1914,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                          num_lanes);
         }
 
         @vtype@ x;
         if (stride == 1) {
-            x = @isa@_masked_load(load_mask, ip);
+            x = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+            x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
         }
 
         /*
@@ -1551,7 +1935,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
         glibc_mask = @and_masks@(load_mask, glibc_mask);
         nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
-        x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+        x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
         npy_int iglibc_mask = @mask_to_int@(glibc_mask);
 
         if (iglibc_mask != @full_mask@) {
@@ -1584,7 +1968,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
             /* multiply by -1 for appropriate elements */
             negate_mask = @isa@_should_negate(iquadrant, twos, twos);
             cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
-            cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+            cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
 
             @masked_store@(op, @cvtps_epi32@(load_mask), cos);
         }
@@ -1662,27 +2046,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
 
     @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
-    @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes);
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
         }
 
         @vtype@ x;
         if (stride == 1) {
-            x = @isa@_masked_load(load_mask, ip);
+            x = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+            x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
         }
 
         nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
-        x = @isa@_set_masked_lanes(x, zeros_f, nan_mask);
+        x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);
 
         xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
         xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1690,7 +2074,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         overflow_mask = @or_masks@(overflow_mask,
                                     @xor_masks@(xmax_mask, inf_mask));
 
-        x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@(
+        x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
                                     @or_masks@(nan_mask, xmin_mask), xmax_mask));
 
         quadrant = _mm@vsize@_mul_ps(x, log2e);
@@ -1723,9 +2107,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
          * elem < xmin; return 0.0f
          * elem = +/- nan, return nan
          */
-        poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
-        poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
-        poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
@@ -1790,24 +2174,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ poly, num_poly, denom_poly, exponent;
 
     @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
-    @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes);
+    @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
     @mask@ divide_by_zero_mask = invalid_mask;
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
         }
 
         @vtype@ x_in;
         if (stride == 1) {
-            x_in = @isa@_masked_load(load_mask, ip);
+            x_in = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x_in  = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+            x_in  = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
         }
 
         negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
@@ -1818,7 +2202,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                                         @and_masks@(zero_mask, load_mask));
         invalid_mask = @or_masks@(invalid_mask, negx_mask);
 
-        @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
+        @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
 
         /* set x = normalized mantissa */
         exponent = @isa@_get_exponent(x);
@@ -1852,10 +2236,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
          * x = +/- NAN; return NAN
          * x = 0.0f; return -INF
          */
-        poly = @isa@_set_masked_lanes(poly, nan, nan_mask);
-        poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask);
-        poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask);
-        poly = @isa@_set_masked_lanes(poly, inf, inf_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask);
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 9be7b63a0..f93d8229e 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -883,7 +883,7 @@ PyUFunc_SubtractionTypeResolver(PyUFuncObject *ufunc,
         /* The type resolver would have upcast already */
         if (out_dtypes[0]->type_num == NPY_BOOL) {
             PyErr_Format(PyExc_TypeError,
-                "numpy boolean subtract, the `-` operator, is deprecated, "
+                "numpy boolean subtract, the `-` operator, is not supported, "
                 "use the bitwise_xor, the `^` operator, or the logical_xor "
                 "function instead.");
             return -1;
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index f99c0f72b..11f900c5f 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -1333,10 +1333,10 @@ class TestDateTime(object):
         # Interaction with NaT
         a = np.array('1999-03-12T13', dtype='M8[2m]')
         dtnat = np.array('NaT', dtype='M8[h]')
-        assert_equal(np.minimum(a, dtnat), a)
-        assert_equal(np.minimum(dtnat, a), a)
-        assert_equal(np.maximum(a, dtnat), a)
-        assert_equal(np.maximum(dtnat, a), a)
+        assert_equal(np.minimum(a, dtnat), dtnat)
+        assert_equal(np.minimum(dtnat, a), dtnat)
+        assert_equal(np.maximum(a, dtnat), dtnat)
+        assert_equal(np.maximum(dtnat, a), dtnat)
 
         # Also do timedelta
         a = np.array(3, dtype='m8[h]')
@@ -1831,7 +1831,7 @@ class TestDateTime(object):
     def test_timedelta_arange_no_dtype(self):
         d = np.array(5, dtype="m8[D]")
         assert_equal(np.arange(d, d + 1), d)
-        assert_raises(ValueError, np.arange, d)
+        assert_equal(np.arange(d), np.arange(0, d))
 
     def test_datetime_maximum_reduce(self):
         a = np.array(['2010-01-02', '1999-03-14', '1833-03'], dtype='M8[D]')
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 9b124f603..c699a9bc1 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3602,10 +3602,10 @@ class TestBinop(object):
         assert_equal(np.modf(dummy, out=(None, a)), (1,))
         assert_equal(np.modf(dummy, out=(dummy, a)), (1,))
         assert_equal(np.modf(a, out=(dummy, a)), 0)
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', DeprecationWarning)
-            assert_equal(np.modf(dummy, out=a), (0,))
-            assert_(w[0].category is DeprecationWarning)
+        with assert_raises(TypeError):
+            # Out argument must be tuple, since there are multiple outputs
+            np.modf(dummy, out=a)
+
         assert_raises(ValueError, np.modf, dummy, out=(a,))
 
         # 2 inputs, 1 output
@@ -4105,17 +4105,17 @@ class TestArgmax(object):
           np.datetime64('2010-01-03T05:14:12'),
           np.datetime64('NaT'),
           np.datetime64('2015-09-23T10:10:13'),
-          np.datetime64('1932-10-10T03:50:30')], 4),
+          np.datetime64('1932-10-10T03:50:30')], 0),
         ([np.datetime64('2059-03-14T12:43:12'),
           np.datetime64('1996-09-21T14:43:15'),
           np.datetime64('NaT'),
           np.datetime64('2022-12-25T16:02:16'),
           np.datetime64('1963-10-04T03:14:12'),
-          np.datetime64('2013-05-08T18:15:23')], 0),
+          np.datetime64('2013-05-08T18:15:23')], 2),
         ([np.timedelta64(2, 's'),
           np.timedelta64(1, 's'),
           np.timedelta64('NaT', 's'),
-          np.timedelta64(3, 's')], 3),
+          np.timedelta64(3, 's')], 2),
         ([np.timedelta64('NaT', 's')] * 3, 0),
 
         ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
@@ -4240,17 +4240,17 @@ class TestArgmin(object):
           np.datetime64('2010-01-03T05:14:12'),
           np.datetime64('NaT'),
           np.datetime64('2015-09-23T10:10:13'),
-          np.datetime64('1932-10-10T03:50:30')], 5),
+          np.datetime64('1932-10-10T03:50:30')], 0),
         ([np.datetime64('2059-03-14T12:43:12'),
           np.datetime64('1996-09-21T14:43:15'),
           np.datetime64('NaT'),
           np.datetime64('2022-12-25T16:02:16'),
           np.datetime64('1963-10-04T03:14:12'),
-          np.datetime64('2013-05-08T18:15:23')], 4),
+          np.datetime64('2013-05-08T18:15:23')], 2),
         ([np.timedelta64(2, 's'),
           np.timedelta64(1, 's'),
           np.timedelta64('NaT', 's'),
-          np.timedelta64(3, 's')], 1),
+          np.timedelta64(3, 's')], 2),
         ([np.timedelta64('NaT', 's')] * 3, 0),
 
         ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
@@ -4366,18 +4366,14 @@ class TestMinMax(object):
         assert_equal(np.amax([[1, 2, 3]], axis=1), 3)
 
     def test_datetime(self):
-        # NaTs are ignored
+        # Do not ignore NaT
         for dtype in ('m8[s]', 'm8[Y]'):
             a = np.arange(10).astype(dtype)
-            a[3] = 'NaT'
             assert_equal(np.amin(a), a[0])
             assert_equal(np.amax(a), a[9])
-            a[0] = 'NaT'
-            assert_equal(np.amin(a), a[1])
-            assert_equal(np.amax(a), a[9])
-            a.fill('NaT')
-            assert_equal(np.amin(a), a[0])
-            assert_equal(np.amax(a), a[0])
+            a[3] = 'NaT'
+            assert_equal(np.amin(a), a[3])
+            assert_equal(np.amax(a), a[3])
 
 
 class TestNewaxis(object):
@@ -7975,6 +7971,8 @@ class TestFormat(object):
                 dst = object.__format__(a, '30')
                 assert_equal(res, dst)
 
+from numpy.testing import IS_PYPY
+
 class TestCTypes(object):
 
     def test_ctypes_is_available(self):
@@ -8041,7 +8039,29 @@ class TestCTypes(object):
 
         # but when the `ctypes_ptr` object dies, so should `arr`
         del ctypes_ptr
+        if IS_PYPY:
+            # Pypy does not recycle arr objects immediately. Trigger gc to
+            # release arr. Cpython uses refcounts. An explicit call to gc
+            # should not be needed here.
+            break_cycles()
+        assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
+
+    def test_ctypes_as_parameter_holds_reference(self):
+        arr = np.array([None]).copy()
+
+        arr_ref = weakref.ref(arr)
+
+        ctypes_ptr = arr.ctypes._as_parameter_
+
+        # `ctypes_ptr` should hold onto `arr`
+        del arr
         break_cycles()
+        assert_(arr_ref() is not None, "ctypes pointer did not hold onto a reference")
+
+        # but when the `ctypes_ptr` object dies, so should `arr`
+        del ctypes_ptr
+        if IS_PYPY:
+            break_cycles()
         assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
 
 
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index a646e5e45..9b4ce9e47 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,8 +687,96 @@ class TestSpecialFloats(object):
             assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
             assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
 
+    def test_sqrt_values(self):
+        """Check sqrt on nan, +/-inf and zero for 'f', 'd' and 'g' dtypes."""
+        inputs = [np.nan, -np.nan, np.inf, -np.inf, 0.]
+        expected = [np.nan,  np.nan, np.inf, np.nan, 0.]
+        with np.errstate(all='ignore'):
+            for dt in ('f', 'd', 'g'):
+                result = np.sqrt(np.array(inputs, dtype=dt))
+                assert_equal(result, np.array(expected, dtype=dt))
+
+        #with np.errstate(invalid='raise'):
+        #    for dt in ['f', 'd', 'g']:
+        #        assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt))
+
+    def test_abs_values(self):
+        """Check abs on nan, +/-inf, signed zero and +/-1 for 'f', 'd', 'g'."""
+        inputs = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0]
+        expected = [np.nan,  np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0]
+        for dt in ('f', 'd', 'g'):
+            result = np.abs(np.array(inputs, dtype=dt))
+            assert_equal(result, np.array(expected, dtype=dt))
+
+    def test_square_values(self):
+        """Check square on nan/inf inputs, then that overflow is reported."""
+        inputs = [np.nan, -np.nan, np.inf, -np.inf]
+        expected = [np.nan,  np.nan, np.inf, np.inf]
+        with np.errstate(all='ignore'):
+            for dt in ('f', 'd', 'g'):
+                result = np.square(np.array(inputs, dtype=dt))
+                assert_equal(result, np.array(expected, dtype=dt))
+
+        with np.errstate(over='raise'):
+            for big, dt in ((1E32, 'f'), (1E200, 'd')):
+                assert_raises(FloatingPointError, np.square, np.array(big, dtype=dt))
 
-class TestSIMDFloat32(object):
+    def test_reciprocal_values(self):
+        """Check reciprocal on nan, +/-inf and signed zero, then divide errors."""
+        inputs = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.]
+        expected = [np.nan,  np.nan, 0.0, -0.0, np.inf, -np.inf]
+        with np.errstate(all='ignore'):
+            for dt in ('f', 'd', 'g'):
+                result = np.reciprocal(np.array(inputs, dtype=dt))
+                assert_equal(result, np.array(expected, dtype=dt))
+
+        with np.errstate(divide='raise'):
+            for dt in ('f', 'd', 'g'):
+                assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt))
+
+# func : [maxulperror, low, high]
+avx_ufuncs = {'sqrt'        :[1,  0.,   100.],
+              'absolute'    :[0, -100., 100.],
+              'reciprocal'  :[1,  1.,   100.],
+              'square'      :[1, -100., 100.],
+              'rint'        :[0, -100., 100.],
+              'floor'       :[0, -100., 100.],
+              'ceil'        :[0, -100., 100.],
+              'trunc'       :[0, -100., 100.]}
+
+class TestAVXUfuncs(object):
+    def test_avx_based_ufunc(self):
+        """Compare float32/float64 results of each avx_ufuncs entry with longdouble."""
+        np.random.seed(42)
+        strides = np.array([-4,-3,-2,-1,1,2,3,4])
+        for func, (maxulperr, minval, maxval) in avx_ufuncs.items():
+            myfunc = getattr(np, func)
+            # various array sizes to ensure masking in AVX is tested
+            for size in range(1,32):
+                x_f32 = np.float32(np.random.uniform(low=minval, high=maxval,
+                    size=size))
+                x_f64 = np.float64(x_f32)
+                x_f128 = np.longdouble(x_f32)
+                y_true128 = myfunc(x_f128)
+                expected32 = np.float32(y_true128)
+                expected64 = np.float64(y_true128)
+                if maxulperr == 0:
+                    assert_equal(myfunc(x_f32), expected32)
+                    assert_equal(myfunc(x_f64), expected64)
+                else:
+                    assert_array_max_ulp(myfunc(x_f32), expected32,
+                            maxulp=maxulperr)
+                    assert_array_max_ulp(myfunc(x_f64), expected64,
+                            maxulp=maxulperr)
+                # various strides to test gather instruction
+                if size > 1:
+                    y_true32 = myfunc(x_f32)
+                    y_true64 = myfunc(x_f64)
+                    for jj in strides:
+                        assert_equal(myfunc(x_f64[::jj]), y_true64[::jj])
+                        assert_equal(myfunc(x_f32[::jj]), y_true32[::jj])
+
+class TestAVXFloat32Transcendental(object):
     def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
@@ -715,8 +803,8 @@ class TestSIMDFloat32(object):
 
     def test_strided_float32(self):
         np.random.seed(42)
-        strides = np.random.randint(low=-100, high=100, size=100)
-        sizes = np.random.randint(low=1, high=2000, size=100)
+        strides = np.array([-4,-3,-2,-1,1,2,3,4])
+        sizes = np.arange(2,100)
         for ii in sizes:
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
             exp_true = np.exp(x_f32)
diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py
index d146739bb..42e3632fd 100644
--- a/numpy/f2py/__init__.py
+++ b/numpy/f2py/__init__.py
@@ -109,6 +109,7 @@ def compile(source,
             output = ''
         else:
             status = 0
+            output = output.decode()
         if verbose:
             print(output)
     finally:
diff --git a/numpy/f2py/cfuncs.py b/numpy/f2py/cfuncs.py
index 17f3861ca..ccb7b3a32 100644
--- a/numpy/f2py/cfuncs.py
+++ b/numpy/f2py/cfuncs.py
@@ -1049,8 +1049,10 @@ static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofarg
     CFUNCSMESS(\"create_cb_arglist\\n\");
     tot=opt=ext=siz=0;
     /* Get the total number of arguments */
-    if (PyFunction_Check(fun))
+    if (PyFunction_Check(fun)) {
         tmp_fun = fun;
+        Py_INCREF(tmp_fun);
+    }
     else {
         di = 1;
         if (PyObject_HasAttrString(fun,\"im_func\")) {
@@ -1062,6 +1064,7 @@ static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofarg
                 tmp_fun = PyObject_GetAttrString(tmp,\"im_func\");
             else {
                 tmp_fun = fun; /* built-in function */
+                Py_INCREF(tmp_fun);
                 tot = maxnofargs;
                 if (xa != NULL)
                     tot += PyTuple_Size((PyObject *)xa);
@@ -1073,6 +1076,7 @@ static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofarg
             if (xa != NULL)
                 tot += PyTuple_Size((PyObject *)xa);
             tmp_fun = fun;
+            Py_INCREF(tmp_fun);
         }
         else if (F2PyCapsule_Check(fun)) {
             tot = maxnofargs;
@@ -1083,6 +1087,7 @@ static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofarg
                 goto capi_fail;
             }
             tmp_fun = fun;
+            Py_INCREF(tmp_fun);
         }
     }
 if (tmp_fun==NULL) {
@@ -1091,13 +1096,19 @@ goto capi_fail;
 }
 #if PY_VERSION_HEX >= 0x03000000
     if (PyObject_HasAttrString(tmp_fun,\"__code__\")) {
-        if (PyObject_HasAttrString(tmp = PyObject_GetAttrString(tmp_fun,\"__code__\"),\"co_argcount\"))
+        if (PyObject_HasAttrString(tmp = PyObject_GetAttrString(tmp_fun,\"__code__\"),\"co_argcount\")) {
 #else
     if (PyObject_HasAttrString(tmp_fun,\"func_code\")) {
-        if (PyObject_HasAttrString(tmp = PyObject_GetAttrString(tmp_fun,\"func_code\"),\"co_argcount\"))
+        if (PyObject_HasAttrString(tmp = PyObject_GetAttrString(tmp_fun,\"func_code\"),\"co_argcount\")) {
 #endif
-            tot = PyInt_AsLong(PyObject_GetAttrString(tmp,\"co_argcount\")) - di;
-        Py_XDECREF(tmp);
+            PyObject *tmp_argcount = PyObject_GetAttrString(tmp,\"co_argcount\");
+            Py_DECREF(tmp);
+            if (tmp_argcount == NULL) {
+                goto capi_fail;
+            }
+            tot = PyInt_AsLong(tmp_argcount) - di;
+            Py_DECREF(tmp_argcount);
+        }
     }
     /* Get the number of optional arguments */
 #if PY_VERSION_HEX >= 0x03000000
@@ -1136,10 +1147,12 @@ goto capi_fail;
             PyTuple_SET_ITEM(*args,i,tmp);
         }
     CFUNCSMESS(\"create_cb_arglist-end\\n\");
+    Py_DECREF(tmp_fun);
     return 1;
 capi_fail:
     if ((PyErr_Occurred())==NULL)
         PyErr_SetString(#modulename#_error,errmess);
+    Py_XDECREF(tmp_fun);
     return 0;
 }
 """
diff --git a/numpy/f2py/common_rules.py b/numpy/f2py/common_rules.py
index 62c1ba207..f61d8810a 100644
--- a/numpy/f2py/common_rules.py
+++ b/numpy/f2py/common_rules.py
@@ -124,8 +124,9 @@ def buildhooks(m):
         cadd('\t%s(f2pyinit%s,F2PYINIT%s)(f2py_setup_%s);'
              % (F_FUNC, lower_name, name.upper(), name))
         cadd('}\n')
-        iadd('\tF2PyDict_SetItemString(d, \"%s\", PyFortranObject_New(f2py_%s_def,f2py_init_%s));' % (
-            name, name, name))
+        iadd('\ttmp = PyFortranObject_New(f2py_%s_def,f2py_init_%s);' % (name, name))
+        iadd('\tF2PyDict_SetItemString(d, \"%s\", tmp);' % name)
+        iadd('\tPy_DECREF(tmp);')
         tname = name.replace('_', '\\_')
         dadd('\\subsection{Common block \\texttt{%s}}\n' % (tname))
         dadd('\\begin{description}')
diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py
index 1b41498ea..f2f713bde 100755
--- a/numpy/f2py/rules.py
+++ b/numpy/f2py/rules.py
@@ -215,6 +215,7 @@ PyMODINIT_FUNC init#modulename#(void) {
 \td = PyModule_GetDict(m);
 \ts = PyString_FromString(\"$R""" + """evision: $\");
 \tPyDict_SetItemString(d, \"__version__\", s);
+\tPy_DECREF(s);
 #if PY_VERSION_HEX >= 0x03000000
 \ts = PyUnicode_FromString(
 #else
@@ -222,8 +223,14 @@ PyMODINIT_FUNC init#modulename#(void) {
 #endif
 \t\t\"This module '#modulename#' is auto-generated with f2py (version:#f2py_version#).\\nFunctions:\\n\"\n#docs#\".\");
 \tPyDict_SetItemString(d, \"__doc__\", s);
-\t#modulename#_error = PyErr_NewException (\"#modulename#.error\", NULL, NULL);
 \tPy_DECREF(s);
+\t#modulename#_error = PyErr_NewException (\"#modulename#.error\", NULL, NULL);
+\t/*
+\t * Store the error object inside the dict, so that it could get deallocated.
+\t * (in practice, this is a module, so it likely will not and cannot.)
+\t */
+\tPyDict_SetItemString(d, \"_#modulename#_error\", #modulename#_error);
+\tPy_DECREF(#modulename#_error);
 \tfor(i=0;f2py_routine_defs[i].name!=NULL;i++) {
 \t\ttmp = PyFortranObject_NewAsAttr(&f2py_routine_defs[i]);
 \t\tPyDict_SetItemString(d, f2py_routine_defs[i].name, tmp);
@@ -238,7 +245,6 @@ PyMODINIT_FUNC init#modulename#(void) {
 \tif (! PyErr_Occurred())
 \t\ton_exit(f2py_report_on_exit,(void*)\"#modulename#\");
 #endif
-
 \treturn RETVAL;
 }
 #ifdef __cplusplus
@@ -439,12 +445,16 @@ rout_rules = [
     {
       extern #ctype# #F_FUNC#(#name_lower#,#NAME#)(void);
       PyObject* o = PyDict_GetItemString(d,"#name#");
-      PyObject_SetAttrString(o,"_cpointer", F2PyCapsule_FromVoidPtr((void*)#F_FUNC#(#name_lower#,#NAME#),NULL));
+      tmp = F2PyCapsule_FromVoidPtr((void*)#F_FUNC#(#name_lower#,#NAME#),NULL);
+      PyObject_SetAttrString(o,"_cpointer", tmp);
+      Py_DECREF(tmp);
 #if PY_VERSION_HEX >= 0x03000000
-      PyObject_SetAttrString(o,"__name__", PyUnicode_FromString("#name#"));
+      s = PyUnicode_FromString("#name#");
 #else
-      PyObject_SetAttrString(o,"__name__", PyString_FromString("#name#"));
+      s = PyString_FromString("#name#");
 #endif
+      PyObject_SetAttrString(o,"__name__", s);
+      Py_DECREF(s);
     }
     '''},
         'need': {l_not(l_or(ismoduleroutine, isdummyroutine)): ['F_WRAPPEDFUNC', 'F_FUNC']},
@@ -477,12 +487,16 @@ rout_rules = [
     {
       extern void #F_FUNC#(#name_lower#,#NAME#)(void);
       PyObject* o = PyDict_GetItemString(d,"#name#");
-      PyObject_SetAttrString(o,"_cpointer", F2PyCapsule_FromVoidPtr((void*)#F_FUNC#(#name_lower#,#NAME#),NULL));
+      tmp = F2PyCapsule_FromVoidPtr((void*)#F_FUNC#(#name_lower#,#NAME#),NULL);
+      PyObject_SetAttrString(o,"_cpointer", tmp);
+      Py_DECREF(tmp);
 #if PY_VERSION_HEX >= 0x03000000
-      PyObject_SetAttrString(o,"__name__", PyUnicode_FromString("#name#"));
+      s = PyUnicode_FromString("#name#");
 #else
-      PyObject_SetAttrString(o,"__name__", PyString_FromString("#name#"));
+      s = PyString_FromString("#name#");
 #endif
+      PyObject_SetAttrString(o,"__name__", s);
+      Py_DECREF(s);
     }
     '''},
         'need': {l_not(l_or(ismoduleroutine, isdummyroutine)): ['F_WRAPPEDFUNC', 'F_FUNC']},
@@ -794,10 +808,13 @@ if (#varname#_capi==Py_None) {
     if (#varname#_xa_capi==NULL) {
       if (PyObject_HasAttrString(#modulename#_module,\"#varname#_extra_args\")) {
         PyObject* capi_tmp = PyObject_GetAttrString(#modulename#_module,\"#varname#_extra_args\");
-        if (capi_tmp)
+        if (capi_tmp) {
           #varname#_xa_capi = (PyTupleObject *)PySequence_Tuple(capi_tmp);
-        else
+          Py_DECREF(capi_tmp);
+        }
+        else {
           #varname#_xa_capi = (PyTupleObject *)Py_BuildValue(\"()\");
+        }
         if (#varname#_xa_capi==NULL) {
           PyErr_SetString(#modulename#_error,\"Failed to convert #modulename#.#varname#_extra_args to tuple.\\n\");
           return NULL;
diff --git a/numpy/f2py/src/fortranobject.c b/numpy/f2py/src/fortranobject.c
index b55385b50..8aa55555d 100644
--- a/numpy/f2py/src/fortranobject.c
+++ b/numpy/f2py/src/fortranobject.c
@@ -39,19 +39,33 @@ PyFortranObject_New(FortranDataDef* defs, f2py_void_func init) {
     int i;
     PyFortranObject *fp = NULL;
     PyObject *v = NULL;
-    if (init!=NULL)                           /* Initialize F90 module objects */
+    if (init!=NULL) {                        /* Initialize F90 module objects */
         (*(init))();
-    if ((fp = PyObject_New(PyFortranObject, &PyFortran_Type))==NULL) return NULL;
-    if ((fp->dict = PyDict_New())==NULL) return NULL;
+    }
+    fp = PyObject_New(PyFortranObject, &PyFortran_Type);
+    if (fp == NULL) {
+        return NULL;
+    }
+    if ((fp->dict = PyDict_New()) == NULL) {
+        Py_DECREF(fp);
+        return NULL;
+    }
     fp->len = 0;
-    while (defs[fp->len].name != NULL) fp->len++;
-    if (fp->len == 0) goto fail;
+    while (defs[fp->len].name != NULL) {
+        fp->len++;
+    }
+    if (fp->len == 0) {
+        goto fail;
+    }
     fp->defs = defs;
-    for (i=0;i<fp->len;i++)
+    for (i=0;i<fp->len;i++) {
         if (fp->defs[i].rank == -1) {                      /* Is Fortran routine */
             v = PyFortranObject_NewAsAttr(&(fp->defs[i]));
-            if (v==NULL) return NULL;
+            if (v==NULL) {
+                goto fail;
+            }
             PyDict_SetItemString(fp->dict,fp->defs[i].name,v);
+            Py_XDECREF(v);
         } else
             if ((fp->defs[i].data)!=NULL) { /* Is Fortran variable or array (not allocatable) */
                 if (fp->defs[i].type == NPY_STRING) {
@@ -65,13 +79,16 @@ PyFortranObject_New(FortranDataDef* defs, f2py_void_func init) {
                                     fp->defs[i].type, NULL, fp->defs[i].data, 0, NPY_ARRAY_FARRAY,
                                     NULL);
                 }
-                if (v==NULL) return NULL;
+                if (v==NULL) {
+                    goto fail;
+                }
                 PyDict_SetItemString(fp->dict,fp->defs[i].name,v);
+                Py_XDECREF(v);
             }
-    Py_XDECREF(v);
+    }
     return (PyObject *)fp;
  fail:
-    Py_XDECREF(v);
+    Py_XDECREF(fp);
     return NULL;
 }
 
diff --git a/numpy/f2py/src/test/foomodule.c b/numpy/f2py/src/test/foomodule.c
index 733fab0be..caf3590d4 100644
--- a/numpy/f2py/src/test/foomodule.c
+++ b/numpy/f2py/src/test/foomodule.c
@@ -115,7 +115,7 @@ static PyMethodDef foo_module_methods[] = {
 
 void initfoo() {
     int i;
-    PyObject *m, *d, *s;
+    PyObject *m, *d, *s, *tmp;
     import_array();
 
     m = Py_InitModule("foo", foo_module_methods);
@@ -125,11 +125,17 @@ void initfoo() {
     PyDict_SetItemString(d, "__doc__", s);
 
     /* Fortran objects: */
-    PyDict_SetItemString(d, "mod", PyFortranObject_New(f2py_mod_def,f2py_init_mod));
-    PyDict_SetItemString(d, "foodata", PyFortranObject_New(f2py_foodata_def,f2py_init_foodata));
-    for(i=0;f2py_routines_def[i].name!=NULL;i++)
-        PyDict_SetItemString(d, f2py_routines_def[i].name,
-                             PyFortranObject_NewAsAttr(&f2py_routines_def[i]));
+    tmp = PyFortranObject_New(f2py_mod_def,f2py_init_mod);
+    PyDict_SetItemString(d, "mod", tmp);
+    Py_DECREF(tmp);
+    tmp = PyFortranObject_New(f2py_foodata_def,f2py_init_foodata);
+    PyDict_SetItemString(d, "foodata", tmp);
+    Py_DECREF(tmp);
+    for(i=0;f2py_routines_def[i].name!=NULL;i++) {
+        tmp = PyFortranObject_NewAsAttr(&f2py_routines_def[i]);
+        PyDict_SetItemString(d, f2py_routines_def[i].name, tmp);
+        Py_DECREF(tmp);
+    }
 
     Py_DECREF(s);
 
diff --git a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
index 7f46303b0..978db4e69 100644
--- a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
+++ b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
@@ -49,9 +49,18 @@ static PyObject *f2py_rout_wrap_call(PyObject *capi_self,
     return NULL;
   rank = PySequence_Length(dims_capi);
   dims = malloc(rank*sizeof(npy_intp));
-  for (i=0;i<rank;++i)
-    dims[i] = (npy_intp)PyInt_AsLong(PySequence_GetItem(dims_capi,i));
-
+  for (i=0;i<rank;++i) {
+    PyObject *tmp;
+    tmp = PySequence_GetItem(dims_capi, i);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    dims[i] = (npy_intp)PyInt_AsLong(tmp);
+    Py_DECREF(tmp);
+    if (dims[i] == -1 && PyErr_Occurred()) {
+        goto fail;
+    }
+  }
   capi_arr_tmp = array_from_pyobj(type_num,dims,rank,intent|F2PY_INTENT_OUT,arr_capi);
   if (capi_arr_tmp == NULL) {
     free(dims);
@@ -60,6 +69,10 @@ static PyObject *f2py_rout_wrap_call(PyObject *capi_self,
   capi_buildvalue = Py_BuildValue("N",capi_arr_tmp);
   free(dims);
   return capi_buildvalue;
+
+fail:
+  free(dims);
+  return NULL;
 }
 
 static char doc_f2py_rout_wrap_attrs[] = "\
@@ -97,7 +110,7 @@ static PyObject *f2py_rout_wrap_attrs(PyObject *capi_self,
     PyTuple_SetItem(dimensions,i,PyInt_FromLong(PyArray_DIM(arr,i)));
     PyTuple_SetItem(strides,i,PyInt_FromLong(PyArray_STRIDE(arr,i)));
   }
-  return Py_BuildValue("siOOO(cciii)ii",s,PyArray_NDIM(arr),
+  return Py_BuildValue("siNNO(cciii)ii",s,PyArray_NDIM(arr),
                        dimensions,strides,
                        (PyArray_BASE(arr)==NULL?Py_None:PyArray_BASE(arr)),
                        PyArray_DESCR(arr)->kind,
@@ -154,61 +167,69 @@ PyMODINIT_FUNC inittest_array_from_pyobj_ext(void) {
   PyDict_SetItemString(d, "__doc__", s);
   wrap_error = PyErr_NewException ("wrap.error", NULL, NULL);
   Py_DECREF(s);
-  PyDict_SetItemString(d, "F2PY_INTENT_IN", PyInt_FromLong(F2PY_INTENT_IN));
-  PyDict_SetItemString(d, "F2PY_INTENT_INOUT", PyInt_FromLong(F2PY_INTENT_INOUT));
-  PyDict_SetItemString(d, "F2PY_INTENT_OUT", PyInt_FromLong(F2PY_INTENT_OUT));
-  PyDict_SetItemString(d, "F2PY_INTENT_HIDE", PyInt_FromLong(F2PY_INTENT_HIDE));
-  PyDict_SetItemString(d, "F2PY_INTENT_CACHE", PyInt_FromLong(F2PY_INTENT_CACHE));
-  PyDict_SetItemString(d, "F2PY_INTENT_COPY", PyInt_FromLong(F2PY_INTENT_COPY));
-  PyDict_SetItemString(d, "F2PY_INTENT_C", PyInt_FromLong(F2PY_INTENT_C));
-  PyDict_SetItemString(d, "F2PY_OPTIONAL", PyInt_FromLong(F2PY_OPTIONAL));
-  PyDict_SetItemString(d, "F2PY_INTENT_INPLACE", PyInt_FromLong(F2PY_INTENT_INPLACE));
-  PyDict_SetItemString(d, "NPY_BOOL", PyInt_FromLong(NPY_BOOL));
-  PyDict_SetItemString(d, "NPY_BYTE", PyInt_FromLong(NPY_BYTE));
-  PyDict_SetItemString(d, "NPY_UBYTE", PyInt_FromLong(NPY_UBYTE));
-  PyDict_SetItemString(d, "NPY_SHORT", PyInt_FromLong(NPY_SHORT));
-  PyDict_SetItemString(d, "NPY_USHORT", PyInt_FromLong(NPY_USHORT));
-  PyDict_SetItemString(d, "NPY_INT", PyInt_FromLong(NPY_INT));
-  PyDict_SetItemString(d, "NPY_UINT", PyInt_FromLong(NPY_UINT));
-  PyDict_SetItemString(d, "NPY_INTP", PyInt_FromLong(NPY_INTP));
-  PyDict_SetItemString(d, "NPY_UINTP", PyInt_FromLong(NPY_UINTP));
-  PyDict_SetItemString(d, "NPY_LONG", PyInt_FromLong(NPY_LONG));
-  PyDict_SetItemString(d, "NPY_ULONG", PyInt_FromLong(NPY_ULONG));
-  PyDict_SetItemString(d, "NPY_LONGLONG", PyInt_FromLong(NPY_LONGLONG));
-  PyDict_SetItemString(d, "NPY_ULONGLONG", PyInt_FromLong(NPY_ULONGLONG));
-  PyDict_SetItemString(d, "NPY_FLOAT", PyInt_FromLong(NPY_FLOAT));
-  PyDict_SetItemString(d, "NPY_DOUBLE", PyInt_FromLong(NPY_DOUBLE));
-  PyDict_SetItemString(d, "NPY_LONGDOUBLE", PyInt_FromLong(NPY_LONGDOUBLE));
-  PyDict_SetItemString(d, "NPY_CFLOAT", PyInt_FromLong(NPY_CFLOAT));
-  PyDict_SetItemString(d, "NPY_CDOUBLE", PyInt_FromLong(NPY_CDOUBLE));
-  PyDict_SetItemString(d, "NPY_CLONGDOUBLE", PyInt_FromLong(NPY_CLONGDOUBLE));
-  PyDict_SetItemString(d, "NPY_OBJECT", PyInt_FromLong(NPY_OBJECT));
-  PyDict_SetItemString(d, "NPY_STRING", PyInt_FromLong(NPY_STRING));
-  PyDict_SetItemString(d, "NPY_UNICODE", PyInt_FromLong(NPY_UNICODE));
-  PyDict_SetItemString(d, "NPY_VOID", PyInt_FromLong(NPY_VOID));
-  PyDict_SetItemString(d, "NPY_NTYPES", PyInt_FromLong(NPY_NTYPES));
-  PyDict_SetItemString(d, "NPY_NOTYPE", PyInt_FromLong(NPY_NOTYPE));
-  PyDict_SetItemString(d, "NPY_USERDEF", PyInt_FromLong(NPY_USERDEF));
-
-  PyDict_SetItemString(d, "CONTIGUOUS", PyInt_FromLong(NPY_ARRAY_C_CONTIGUOUS));
-  PyDict_SetItemString(d, "FORTRAN", PyInt_FromLong(NPY_ARRAY_F_CONTIGUOUS));
-  PyDict_SetItemString(d, "OWNDATA", PyInt_FromLong(NPY_ARRAY_OWNDATA));
-  PyDict_SetItemString(d, "FORCECAST", PyInt_FromLong(NPY_ARRAY_FORCECAST));
-  PyDict_SetItemString(d, "ENSURECOPY", PyInt_FromLong(NPY_ARRAY_ENSURECOPY));
-  PyDict_SetItemString(d, "ENSUREARRAY", PyInt_FromLong(NPY_ARRAY_ENSUREARRAY));
-  PyDict_SetItemString(d, "ALIGNED", PyInt_FromLong(NPY_ARRAY_ALIGNED));
-  PyDict_SetItemString(d, "WRITEABLE", PyInt_FromLong(NPY_ARRAY_WRITEABLE));
-  PyDict_SetItemString(d, "UPDATEIFCOPY", PyInt_FromLong(NPY_ARRAY_UPDATEIFCOPY));
-  PyDict_SetItemString(d, "WRITEBACKIFCOPY", PyInt_FromLong(NPY_ARRAY_WRITEBACKIFCOPY));
-
-  PyDict_SetItemString(d, "BEHAVED", PyInt_FromLong(NPY_ARRAY_BEHAVED));
-  PyDict_SetItemString(d, "BEHAVED_NS", PyInt_FromLong(NPY_ARRAY_BEHAVED_NS));
-  PyDict_SetItemString(d, "CARRAY", PyInt_FromLong(NPY_ARRAY_CARRAY));
-  PyDict_SetItemString(d, "FARRAY", PyInt_FromLong(NPY_ARRAY_FARRAY));
-  PyDict_SetItemString(d, "CARRAY_RO", PyInt_FromLong(NPY_ARRAY_CARRAY_RO));
-  PyDict_SetItemString(d, "FARRAY_RO", PyInt_FromLong(NPY_ARRAY_FARRAY_RO));
-  PyDict_SetItemString(d, "DEFAULT", PyInt_FromLong(NPY_ARRAY_DEFAULT));
-  PyDict_SetItemString(d, "UPDATE_ALL", PyInt_FromLong(NPY_ARRAY_UPDATE_ALL));
+
+#define ADDCONST(NAME, CONST)              \
+    s = PyInt_FromLong(CONST);             \
+    PyDict_SetItemString(d, NAME, s);      \
+    Py_DECREF(s)
+
+  ADDCONST("F2PY_INTENT_IN", F2PY_INTENT_IN);
+  ADDCONST("F2PY_INTENT_INOUT", F2PY_INTENT_INOUT);
+  ADDCONST("F2PY_INTENT_OUT", F2PY_INTENT_OUT);
+  ADDCONST("F2PY_INTENT_HIDE", F2PY_INTENT_HIDE);
+  ADDCONST("F2PY_INTENT_CACHE", F2PY_INTENT_CACHE);
+  ADDCONST("F2PY_INTENT_COPY", F2PY_INTENT_COPY);
+  ADDCONST("F2PY_INTENT_C", F2PY_INTENT_C);
+  ADDCONST("F2PY_OPTIONAL", F2PY_OPTIONAL);
+  ADDCONST("F2PY_INTENT_INPLACE", F2PY_INTENT_INPLACE);
+  ADDCONST("NPY_BOOL", NPY_BOOL);
+  ADDCONST("NPY_BYTE", NPY_BYTE);
+  ADDCONST("NPY_UBYTE", NPY_UBYTE);
+  ADDCONST("NPY_SHORT", NPY_SHORT);
+  ADDCONST("NPY_USHORT", NPY_USHORT);
+  ADDCONST("NPY_INT", NPY_INT);
+  ADDCONST("NPY_UINT", NPY_UINT);
+  ADDCONST("NPY_INTP", NPY_INTP);
+  ADDCONST("NPY_UINTP", NPY_UINTP);
+  ADDCONST("NPY_LONG", NPY_LONG);
+  ADDCONST("NPY_ULONG", NPY_ULONG);
+  ADDCONST("NPY_LONGLONG", NPY_LONGLONG);
+  ADDCONST("NPY_ULONGLONG", NPY_ULONGLONG);
+  ADDCONST("NPY_FLOAT", NPY_FLOAT);
+  ADDCONST("NPY_DOUBLE", NPY_DOUBLE);
+  ADDCONST("NPY_LONGDOUBLE", NPY_LONGDOUBLE);
+  ADDCONST("NPY_CFLOAT", NPY_CFLOAT);
+  ADDCONST("NPY_CDOUBLE", NPY_CDOUBLE);
+  ADDCONST("NPY_CLONGDOUBLE", NPY_CLONGDOUBLE);
+  ADDCONST("NPY_OBJECT", NPY_OBJECT);
+  ADDCONST("NPY_STRING", NPY_STRING);
+  ADDCONST("NPY_UNICODE", NPY_UNICODE);
+  ADDCONST("NPY_VOID", NPY_VOID);
+  ADDCONST("NPY_NTYPES", NPY_NTYPES);
+  ADDCONST("NPY_NOTYPE", NPY_NOTYPE);
+  ADDCONST("NPY_USERDEF", NPY_USERDEF);
+
+  ADDCONST("CONTIGUOUS", NPY_ARRAY_C_CONTIGUOUS);
+  ADDCONST("FORTRAN", NPY_ARRAY_F_CONTIGUOUS);
+  ADDCONST("OWNDATA", NPY_ARRAY_OWNDATA);
+  ADDCONST("FORCECAST", NPY_ARRAY_FORCECAST);
+  ADDCONST("ENSURECOPY", NPY_ARRAY_ENSURECOPY);
+  ADDCONST("ENSUREARRAY", NPY_ARRAY_ENSUREARRAY);
+  ADDCONST("ALIGNED", NPY_ARRAY_ALIGNED);
+  ADDCONST("WRITEABLE", NPY_ARRAY_WRITEABLE);
+  ADDCONST("UPDATEIFCOPY", NPY_ARRAY_UPDATEIFCOPY);
+  ADDCONST("WRITEBACKIFCOPY", NPY_ARRAY_WRITEBACKIFCOPY);
+
+  ADDCONST("BEHAVED", NPY_ARRAY_BEHAVED);
+  ADDCONST("BEHAVED_NS", NPY_ARRAY_BEHAVED_NS);
+  ADDCONST("CARRAY", NPY_ARRAY_CARRAY);
+  ADDCONST("FARRAY", NPY_ARRAY_FARRAY);
+  ADDCONST("CARRAY_RO", NPY_ARRAY_CARRAY_RO);
+  ADDCONST("FARRAY_RO", NPY_ARRAY_FARRAY_RO);
+  ADDCONST("DEFAULT", NPY_ARRAY_DEFAULT);
+  ADDCONST("UPDATE_ALL", NPY_ARRAY_UPDATE_ALL);
+
+#undef ADDCONST
 
   if (PyErr_Occurred())
     Py_FatalError("can't initialize module wrap");
diff --git a/numpy/f2py/tests/test_compile_function.py b/numpy/f2py/tests/test_compile_function.py
index 36abf05f9..40ea7997f 100644
--- a/numpy/f2py/tests/test_compile_function.py
+++ b/numpy/f2py/tests/test_compile_function.py
@@ -29,6 +29,7 @@ def setup_module():
 @pytest.mark.parametrize(
     "extra_args", [['--noopt', '--debug'], '--noopt --debug', '']
     )
+@pytest.mark.leaks_references(reason="Imported module seems never deleted.")
 def test_f2py_init_compile(extra_args):
     # flush through the f2py __init__ compile() function code path as a
     # crude test for input handling following migration from
@@ -81,6 +82,9 @@ def test_f2py_init_compile(extra_args):
             return_check = import_module(modname)
             calc_result = return_check.foo()
             assert_equal(calc_result, 15)
+            # Removal from sys.modules is not strictly necessary: even after
+            # removal, the module (dict) stays alive.
+            del sys.modules[modname]
 
 
 def test_f2py_init_compile_failure():
diff --git a/numpy/f2py/tests/util.py b/numpy/f2py/tests/util.py
index d20dc5908..77cb612d0 100644
--- a/numpy/f2py/tests/util.py
+++ b/numpy/f2py/tests/util.py
@@ -31,6 +31,7 @@ except ImportError:
 #
 
 _module_dir = None
+_module_num = 5403
 
 
 def _cleanup():
@@ -59,13 +60,14 @@ def get_module_dir():
 
 def get_temp_module_name():
     # Assume single-threaded, and the module dir usable only by this thread
+    global _module_num
     d = get_module_dir()
-    for j in range(5403, 9999999):
-        name = "_test_ext_module_%d" % j
-        fn = os.path.join(d, name)
-        if name not in sys.modules and not os.path.isfile(fn + '.py'):
-            return name
-    raise RuntimeError("Failed to create a temporary module name")
+    name = "_test_ext_module_%d" % _module_num
+    _module_num += 1
+    if name in sys.modules:
+        # this should not be possible, but check anyway
+        raise RuntimeError("Temporary module name already in use.")
+    return name
 
 
 def _memoize(func):
diff --git a/numpy/lib/financial.py b/numpy/lib/financial.py
index d72384e99..3ac3a4c33 100644
--- a/numpy/lib/financial.py
+++ b/numpy/lib/financial.py
@@ -12,6 +12,7 @@ otherwise stated.
 """
 from __future__ import division, absolute_import, print_function
 
+import warnings
 from decimal import Decimal
 import functools
 
@@ -19,6 +20,10 @@ import numpy as np
 from numpy.core import overrides
 
 
+_depmsg = ("numpy.{name} is deprecated and will be removed from NumPy 1.20. "
+           "Use numpy_financial.{name} instead "
+           "(https://pypi.org/project/numpy-financial/).")
+
 array_function_dispatch = functools.partial(
     overrides.array_function_dispatch, module='numpy')
 
@@ -45,6 +50,8 @@ def _convert_when(when):
 
 
 def _fv_dispatcher(rate, nper, pmt, pv, when=None):
+    warnings.warn(_depmsg.format(name='fv'),
+                  DeprecationWarning, stacklevel=3)
     return (rate, nper, pmt, pv)
 
 
@@ -53,6 +60,15 @@ def fv(rate, nper, pmt, pv, when='end'):
     """
     Compute the future value.
 
+    .. deprecated:: 1.18
+
+       `fv` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Given:
      * a present value, `pv`
      * an interest `rate` compounded once per period, of which
@@ -139,6 +155,8 @@ def fv(rate, nper, pmt, pv, when='end'):
 
 
 def _pmt_dispatcher(rate, nper, pv, fv=None, when=None):
+    warnings.warn(_depmsg.format(name='pmt'),
+                  DeprecationWarning, stacklevel=3)
     return (rate, nper, pv, fv)
 
 
@@ -147,6 +165,15 @@ def pmt(rate, nper, pv, fv=0, when='end'):
     """
     Compute the payment against loan principal plus interest.
 
+    .. deprecated:: 1.18
+
+       `pmt` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Given:
      * a present value, `pv` (e.g., an amount borrowed)
      * a future value, `fv` (e.g., 0)
@@ -237,6 +264,8 @@ def pmt(rate, nper, pv, fv=0, when='end'):
 
 
 def _nper_dispatcher(rate, pmt, pv, fv=None, when=None):
+    warnings.warn(_depmsg.format(name='nper'),
+                  DeprecationWarning, stacklevel=3)
     return (rate, pmt, pv, fv)
 
 
@@ -245,6 +274,15 @@ def nper(rate, pmt, pv, fv=0, when='end'):
     """
     Compute the number of periodic payments.
 
+    .. deprecated:: 1.18
+
+       `nper` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     :class:`decimal.Decimal` type is not supported.
 
     Parameters
@@ -311,6 +349,8 @@ def nper(rate, pmt, pv, fv=0, when='end'):
 
 
 def _ipmt_dispatcher(rate, per, nper, pv, fv=None, when=None):
+    warnings.warn(_depmsg.format(name='ipmt'),
+                  DeprecationWarning, stacklevel=3)
     return (rate, per, nper, pv, fv)
 
 
@@ -319,6 +359,15 @@ def ipmt(rate, per, nper, pv, fv=0, when='end'):
     """
     Compute the interest portion of a payment.
 
+    .. deprecated:: 1.18
+
+       `ipmt` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Parameters
     ----------
     rate : scalar or array_like of shape(M, )
@@ -422,6 +471,8 @@ def _rbl(rate, per, pmt, pv, when):
 
 
 def _ppmt_dispatcher(rate, per, nper, pv, fv=None, when=None):
+    warnings.warn(_depmsg.format(name='ppmt'),
+                  DeprecationWarning, stacklevel=3)
     return (rate, per, nper, pv, fv)
 
 
@@ -430,6 +481,15 @@ def ppmt(rate, per, nper, pv, fv=0, when='end'):
     """
     Compute the payment against loan principal.
 
+    .. deprecated:: 1.18
+
+       `ppmt` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Parameters
     ----------
     rate : array_like
@@ -456,6 +516,8 @@ def ppmt(rate, per, nper, pv, fv=0, when='end'):
 
 
 def _pv_dispatcher(rate, nper, pmt, fv=None, when=None):
+    warnings.warn(_depmsg.format(name='pv'),
+                  DeprecationWarning, stacklevel=3)
-    return (rate, nper, nper, pv, fv)
+    return (rate, nper, pmt, fv)
 
 
@@ -464,6 +526,15 @@ def pv(rate, nper, pmt, fv=0, when='end'):
     """
     Compute the present value.
 
+    .. deprecated:: 1.18
+
+       `pv` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Given:
      * a future value, `fv`
      * an interest `rate` compounded once per period, of which
@@ -567,6 +638,8 @@ def _g_div_gp(r, n, p, x, y, w):
 
 def _rate_dispatcher(nper, pmt, pv, fv, when=None, guess=None, tol=None,
                      maxiter=None):
+    warnings.warn(_depmsg.format(name='rate'),
+                  DeprecationWarning, stacklevel=3)
     return (nper, pmt, pv, fv)
 
 
@@ -582,6 +655,15 @@ def rate(nper, pmt, pv, fv, when='end', guess=None, tol=None, maxiter=100):
     """
     Compute the rate of interest per period.
 
+    .. deprecated:: 1.18
+
+       `rate` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Parameters
     ----------
     nper : array_like
@@ -651,6 +733,8 @@ def rate(nper, pmt, pv, fv, when='end', guess=None, tol=None, maxiter=100):
 
 
 def _irr_dispatcher(values):
+    warnings.warn(_depmsg.format(name='irr'),
+                  DeprecationWarning, stacklevel=3)
     return (values,)
 
 
@@ -659,6 +743,15 @@ def irr(values):
     """
     Return the Internal Rate of Return (IRR).
 
+    .. deprecated:: 1.18
+
+       `irr` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     This is the "average" periodically compounded rate of return
     that gives a net present value of 0.0; for a more complete explanation,
     see Notes below.
@@ -734,6 +827,8 @@ def irr(values):
 
 
 def _npv_dispatcher(rate, values):
+    warnings.warn(_depmsg.format(name='npv'),
+                  DeprecationWarning, stacklevel=3)
     return (values,)
 
 
@@ -742,6 +837,15 @@ def npv(rate, values):
     """
     Returns the NPV (Net Present Value) of a cash flow series.
 
+    .. deprecated:: 1.18
+
+       `npv` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Parameters
     ----------
     rate : scalar
@@ -808,6 +912,8 @@ def npv(rate, values):
 
 
 def _mirr_dispatcher(values, finance_rate, reinvest_rate):
+    warnings.warn(_depmsg.format(name='mirr'),
+                  DeprecationWarning, stacklevel=3)
     return (values,)
 
 
@@ -816,6 +922,15 @@ def mirr(values, finance_rate, reinvest_rate):
     """
     Modified internal rate of return.
 
+    .. deprecated:: 1.18
+
+       `mirr` is deprecated; see NEP 32::
+
+           https://numpy.org/neps/nep-0032-remove-financial-functions.html
+
+       Use the corresponding function in the numpy-financial library,
+       https://pypi.org/project/numpy-financial
+
     Parameters
     ----------
     values : array_like
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index a81b1c816..3ad630a7d 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -1314,9 +1314,13 @@ def interp(x, xp, fp, left=None, right=None, period=None):
 
     Notes
     -----
-    Does not check that the x-coordinate sequence `xp` is increasing.
-    If `xp` is not increasing, the results are nonsense.
-    A simple check for increasing is::
+    The x-coordinate sequence is expected to be increasing, but this is not
+    explicitly enforced.  However, if the sequence `xp` is non-increasing,
+    interpolation results are meaningless.
+
+    Note that, since NaN is unsortable, `xp` also cannot contain NaNs.
+
+    A simple check for `xp` being strictly increasing is::
 
         np.all(np.diff(xp) > 0)
 
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 8474bd5d3..03c365ab6 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -22,6 +22,16 @@ array_function_dispatch = functools.partial(
 _range = range
 
 
+def _ptp(x):
+    """Peak-to-peak value of x.
+
+    This implementation avoids the problem of signed integer arrays having a
+    peak-to-peak value that cannot be represented with the array's data type.
+    This function returns an unsigned value for signed integer arrays.
+    """
+    return _unsigned_subtract(x.max(), x.min())
+
+
 def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
@@ -40,7 +50,7 @@ def _hist_bin_sqrt(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / np.sqrt(x.size)
+    return _ptp(x) / np.sqrt(x.size)
 
 
 def _hist_bin_sturges(x, range):
@@ -63,7 +73,7 @@ def _hist_bin_sturges(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / (np.log2(x.size) + 1.0)
+    return _ptp(x) / (np.log2(x.size) + 1.0)
 
 
 def _hist_bin_rice(x, range):
@@ -87,7 +97,7 @@ def _hist_bin_rice(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / (2.0 * x.size ** (1.0 / 3))
+    return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
 
 
 def _hist_bin_scott(x, range):
@@ -137,7 +147,7 @@ def _hist_bin_stone(x, range):
     """
 
     n = x.size
-    ptp_x = np.ptp(x)
+    ptp_x = _ptp(x)
     if n <= 1 or ptp_x == 0:
         return 0
 
@@ -184,7 +194,7 @@ def _hist_bin_doane(x, range):
             np.true_divide(temp, sigma, temp)
             np.power(temp, 3, temp)
             g1 = np.mean(temp)
-            return x.ptp() / (1.0 + np.log2(x.size) +
+            return _ptp(x) / (1.0 + np.log2(x.size) +
                                     np.log2(1.0 + np.absolute(g1) / sg1))
     return 0.0
 
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index e57a6dd47..7e1d4db4f 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -480,7 +480,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
     file : file, str, or pathlib.Path
         File or filename to which the data is saved.  If file is a file-object,
         then the filename is unchanged.  If file is a string or Path, a ``.npy``
-        extension will be appended to the file name if it does not already
+        extension will be appended to the filename if it does not already
         have one.
     arr : array_like
         Array data to be saved.
@@ -506,9 +506,9 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
     Notes
     -----
     For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
-        
-    Any data saved to the file is appended to the end of the file. 
-    
+
+    Any data saved to the file is appended to the end of the file.
+
     Examples
     --------
     >>> from tempfile import TemporaryFile
@@ -524,7 +524,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
 
     >>> with open('test.npy', 'wb') as f:
     ...     np.save(f, np.array([1, 2]))
-    ...     np.save(f, np.array([1, 3]))    
+    ...     np.save(f, np.array([1, 3]))
     >>> with open('test.npy', 'rb') as f:
     ...     a = np.load(f)
     ...     b = np.load(f)
@@ -565,8 +565,7 @@ def _savez_dispatcher(file, *args, **kwds):
 
 @array_function_dispatch(_savez_dispatcher)
 def savez(file, *args, **kwds):
-    """
-    Save several arrays into a single file in uncompressed ``.npz`` format.
+    """Save several arrays into a single file in uncompressed ``.npz`` format.
 
     If arguments are passed in with no keywords, the corresponding variable
     names, in the ``.npz`` file, are 'arr_0', 'arr_1', etc. If keyword
@@ -576,9 +575,9 @@ def savez(file, *args, **kwds):
     Parameters
     ----------
     file : str or file
-        Either the file name (string) or an open file (file-like object)
+        Either the filename (string) or an open file (file-like object)
         where the data will be saved. If file is a string or a Path, the
-        ``.npz`` extension will be appended to the file name if it is not
+        ``.npz`` extension will be appended to the filename if it is not
         already there.
     args : Arguments, optional
         Arrays to save to the file. Since it is not possible for Python to
@@ -611,6 +610,10 @@ def savez(file, *args, **kwds):
     its list of arrays (with the ``.files`` attribute), and for the arrays
     themselves.
 
+    When saving dictionaries, the dictionary keys become filenames
+    inside the ZIP archive. Therefore, keys should be valid filenames.
+    E.g., avoid keys that begin with ``/`` or contain ``.``.
+
     Examples
     --------
     >>> from tempfile import TemporaryFile
@@ -638,7 +641,6 @@ def savez(file, *args, **kwds):
     ['x', 'y']
     >>> npzfile['x']
     array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
-
     """
     _savez(file, args, kwds, False)
 
@@ -656,15 +658,15 @@ def savez_compressed(file, *args, **kwds):
     Save several arrays into a single file in compressed ``.npz`` format.
 
     If keyword arguments are given, then filenames are taken from the keywords.
-    If arguments are passed in with no keywords, then stored file names are
+    If arguments are passed in with no keywords, then stored filenames are
     arr_0, arr_1, etc.
 
     Parameters
     ----------
     file : str or file
-        Either the file name (string) or an open file (file-like object)
+        Either the filename (string) or an open file (file-like object)
         where the data will be saved. If file is a string or a Path, the
-        ``.npz`` extension will be appended to the file name if it is not
+        ``.npz`` extension will be appended to the filename if it is not
         already there.
     args : Arguments, optional
         Arrays to save to the file. Since it is not possible for Python to
@@ -1469,7 +1471,7 @@ def fromregex(file, regexp, dtype, encoding=None):
     Parameters
     ----------
     file : str or file
-        File name or file object to read.
+        Filename or file object to read.
     regexp : str or regexp
         Regular expression used to parse the file.
         Groups in the regular expression correspond to fields in the dtype.
diff --git a/numpy/lib/tests/test_financial.py b/numpy/lib/tests/test_financial.py
index 21088765f..cb67f7c0f 100644
--- a/numpy/lib/tests/test_financial.py
+++ b/numpy/lib/tests/test_financial.py
@@ -1,5 +1,6 @@
 from __future__ import division, absolute_import, print_function
 
+import warnings
 from decimal import Decimal
 
 import numpy as np
@@ -8,22 +9,35 @@ from numpy.testing import (
     )
 
 
+def filter_deprecation(func):
+    def newfunc(*args, **kwargs):
+        with warnings.catch_warnings(record=True) as ws:
+            warnings.filterwarnings('always', category=DeprecationWarning)
+            func(*args, **kwargs)
+            assert_(all(w.category is DeprecationWarning for w in ws))
+    return newfunc
+
+
 class TestFinancial(object):
+    @filter_deprecation
     def test_npv_irr_congruence(self):
         # IRR is defined as the rate required for the present value of a
         # a series of cashflows to be zero i.e. NPV(IRR(x), x) = 0
         cashflows = np.array([-40000, 5000, 8000, 12000, 30000])
         assert_allclose(np.npv(np.irr(cashflows), cashflows), 0, atol=1e-10, rtol=0)
 
+    @filter_deprecation
     def test_rate(self):
         assert_almost_equal(
             np.rate(10, 0, -3500, 10000),
             0.1107, 4)
 
+    @filter_deprecation
     def test_rate_decimal(self):
         rate = np.rate(Decimal('10'), Decimal('0'), Decimal('-3500'), Decimal('10000'))
         assert_equal(Decimal('0.1106908537142689284704528100'), rate)
 
+    @filter_deprecation
     def test_irr(self):
         v = [-150000, 15000, 25000, 35000, 45000, 60000]
         assert_almost_equal(np.irr(v), 0.0524, 2)
@@ -43,20 +57,25 @@ class TestFinancial(object):
         v = [-1, -2, -3]
         assert_equal(np.irr(v), np.nan)
 
+    @filter_deprecation
     def test_pv(self):
         assert_almost_equal(np.pv(0.07, 20, 12000, 0), -127128.17, 2)
 
+    @filter_deprecation
     def test_pv_decimal(self):
         assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0')),
                      Decimal('-127128.1709461939327295222005'))
 
+    @filter_deprecation
     def test_fv(self):
         assert_equal(np.fv(0.075, 20, -2000, 0, 0), 86609.362673042924)
 
+    @filter_deprecation
     def test_fv_decimal(self):
         assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), 0, 0),
                      Decimal('86609.36267304300040536731624'))
 
+    @filter_deprecation
     def test_pmt(self):
         res = np.pmt(0.08 / 12, 5 * 12, 15000)
         tgt = -304.145914
@@ -71,6 +90,7 @@ class TestFinancial(object):
         tgt = np.array([[-166.66667, -19311.258], [-626.90814, -19311.258]])
         assert_allclose(res, tgt)
 
+    @filter_deprecation
     def test_pmt_decimal(self):
         res = np.pmt(Decimal('0.08') / Decimal('12'), 5 * 12, 15000)
         tgt = Decimal('-304.1459143262052370338701494')
@@ -94,18 +114,22 @@ class TestFinancial(object):
         assert_equal(res[1][0], tgt[1][0])
         assert_equal(res[1][1], tgt[1][1])
 
+    @filter_deprecation
     def test_ppmt(self):
         assert_equal(np.round(np.ppmt(0.1 / 12, 1, 60, 55000), 2), -710.25)
 
+    @filter_deprecation
     def test_ppmt_decimal(self):
         assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000')),
                      Decimal('-710.2541257864217612489830917'))
 
     # Two tests showing how Decimal is actually getting at a more exact result
     # .23 / 12 does not come out nicely as a float but does as a decimal
+    @filter_deprecation
     def test_ppmt_special_rate(self):
         assert_equal(np.round(np.ppmt(0.23 / 12, 1, 60, 10000000000), 8), -90238044.232277036)
 
+    @filter_deprecation
     def test_ppmt_special_rate_decimal(self):
         # When rounded out to 8 decimal places like the float based test, this should not equal the same value
         # as the float, substituted for the decimal
@@ -118,31 +142,38 @@ class TestFinancial(object):
         assert_equal(np.ppmt(Decimal('0.23') / Decimal('12'), 1, 60, Decimal('10000000000')),
                      Decimal('-90238044.2322778884413969909'))
 
+    @filter_deprecation
     def test_ipmt(self):
         assert_almost_equal(np.round(np.ipmt(0.1 / 12, 1, 24, 2000), 2), -16.67)
 
+    @filter_deprecation
     def test_ipmt_decimal(self):
         result = np.ipmt(Decimal('0.1') / Decimal('12'), 1, 24, 2000)
         assert_equal(result.flat[0], Decimal('-16.66666666666666666666666667'))
 
+    @filter_deprecation
     def test_nper(self):
         assert_almost_equal(np.nper(0.075, -2000, 0, 100000.),
                             21.54, 2)
 
+    @filter_deprecation
     def test_nper2(self):
         assert_almost_equal(np.nper(0.0, -2000, 0, 100000.),
                             50.0, 1)
 
+    @filter_deprecation
     def test_npv(self):
         assert_almost_equal(
             np.npv(0.05, [-15000, 1500, 2500, 3500, 4500, 6000]),
             122.89, 2)
 
+    @filter_deprecation
     def test_npv_decimal(self):
         assert_equal(
             np.npv(Decimal('0.05'), [-15000, 1500, 2500, 3500, 4500, 6000]),
             Decimal('122.894854950942692161628715'))
 
+    @filter_deprecation
     def test_mirr(self):
         val = [-4500, -800, 800, 800, 600, 600, 800, 800, 700, 3000]
         assert_almost_equal(np.mirr(val, 0.08, 0.055), 0.0666, 4)
@@ -156,6 +187,7 @@ class TestFinancial(object):
         val = [39000, 30000, 21000, 37000, 46000]
         assert_(np.isnan(np.mirr(val, 0.10, 0.12)))
 
+    @filter_deprecation
     def test_mirr_decimal(self):
         val = [Decimal('-4500'), Decimal('-800'), Decimal('800'), Decimal('800'),
                Decimal('600'), Decimal('600'), Decimal('800'), Decimal('800'),
@@ -174,6 +206,7 @@ class TestFinancial(object):
         val = [Decimal('39000'), Decimal('30000'), Decimal('21000'), Decimal('37000'), Decimal('46000')]
         assert_(np.isnan(np.mirr(val, Decimal('0.10'), Decimal('0.12'))))
 
+    @filter_deprecation
     def test_when(self):
         # begin
         assert_equal(np.rate(10, 20, -3500, 10000, 1),
@@ -238,6 +271,7 @@ class TestFinancial(object):
         assert_equal(np.nper(0.075, -2000, 0, 100000., 0),
                      np.nper(0.075, -2000, 0, 100000., 'end'))
 
+    @filter_deprecation
     def test_decimal_with_when(self):
         """Test that decimals are still supported if the when argument is passed"""
         # begin
@@ -312,6 +346,7 @@ class TestFinancial(object):
                      np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
                              Decimal('0'), 'end').flat[0])
 
+    @filter_deprecation
     def test_broadcast(self):
         assert_almost_equal(np.nper(0.075, -2000, 0, 100000., [0, 1]),
                             [21.5449442, 20.76156441], 4)
@@ -329,6 +364,7 @@ class TestFinancial(object):
                             [-74.998201, -75.62318601, -75.62318601,
                              -76.88882405, -76.88882405], 4)
 
+    @filter_deprecation
     def test_broadcast_decimal(self):
         # Use almost equal because precision is tested in the explicit tests, this test is to ensure
         # broadcast with Decimal is not broken.
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 4895a722c..dbf189f3e 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -8,6 +8,7 @@ from numpy.testing import (
     assert_array_almost_equal, assert_raises, assert_allclose,
     assert_array_max_ulp, assert_raises_regex, suppress_warnings,
     )
+import pytest
 
 
 class TestHistogram(object):
@@ -591,6 +592,16 @@ class TestHistogramOptimBinNums(object):
                 msg += " with datasize of {0}".format(testlen)
                 assert_equal(len(a), numbins, err_msg=msg)
 
+    @pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
+                                      'stone', 'rice', 'sturges'])
+    def test_signed_integer_data(self, bins):
+        # Regression test for gh-14379.
+        a = np.array([-2, 0, 127], dtype=np.int8)
+        hist, edges = np.histogram(a, bins=bins)
+        hist32, edges32 = np.histogram(a.astype(np.int32), bins=bins)
+        assert_array_equal(hist, hist32)
+        assert_array_equal(edges, edges32)
+
     def test_simple_weighted(self):
         """
         Check that weighted data raises a TypeError
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 6ee17c830..1181fe986 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -1871,7 +1871,7 @@ M   33  21.99
         data = ["1, 1, 1, 1, -1.1"] * 50
         mdata = TextIO("\n".join(data))
 
-        converters = {4: lambda x: "(%s)" % x}
+        converters = {4: lambda x: "(%s)" % x.decode()}
         kwargs = dict(delimiter=",", converters=converters,
                       dtype=[(_, int) for _ in 'abcde'],)
         assert_raises(ValueError, np.genfromtxt, mdata, **kwargs)
diff --git a/numpy/matlib.py b/numpy/matlib.py
index 604ef470b..b1b155586 100644
--- a/numpy/matlib.py
+++ b/numpy/matlib.py
@@ -239,7 +239,7 @@ def rand(*args):
 
     See Also
     --------
-    randn, numpy.random.rand
+    randn, numpy.random.RandomState.rand
 
     Examples
     --------
@@ -285,7 +285,7 @@ def randn(*args):
 
     See Also
     --------
-    rand, random.randn
+    rand, numpy.random.RandomState.randn
 
     Notes
     -----
diff --git a/numpy/random/__init__.py b/numpy/random/__init__.py
index f7c248451..1ceb5c4dd 100644
--- a/numpy/random/__init__.py
+++ b/numpy/random/__init__.py
@@ -179,20 +179,19 @@ __all__ = [
 
 # add these for module-freeze analysis (like PyInstaller)
 from . import _pickle
-from . import common
-from . import bounded_integers
-
+from . import _common
+from . import _bounded_integers
+
+from ._generator import Generator, default_rng
+from ._bit_generator import SeedSequence, BitGenerator
+from ._mt19937 import MT19937
+from ._pcg64 import PCG64
+from ._philox import Philox
+from ._sfc64 import SFC64
 from .mtrand import *
-from .generator import Generator, default_rng
-from .bit_generator import SeedSequence
-from .mt19937 import MT19937
-from .pcg64 import PCG64
-from .philox import Philox
-from .sfc64 import SFC64
-from .mtrand import RandomState
 
 __all__ += ['Generator', 'RandomState', 'SeedSequence', 'MT19937',
-            'Philox', 'PCG64', 'SFC64', 'default_rng']
+            'Philox', 'PCG64', 'SFC64', 'default_rng', 'BitGenerator']
 
 
 def __RandomState_ctor():
diff --git a/numpy/random/bit_generator.pxd b/numpy/random/_bit_generator.pxd
index 984033f17..30fa4a27d 100644
--- a/numpy/random/bit_generator.pxd
+++ b/numpy/random/_bit_generator.pxd
@@ -1,6 +1,15 @@
-
-from .common cimport bitgen_t, uint32_t
 cimport numpy as np
+from libc.stdint cimport uint32_t, uint64_t
+
+cdef extern from "include/bitgen.h":
+    struct bitgen:
+        void *state
+        uint64_t (*next_uint64)(void *st) nogil
+        uint32_t (*next_uint32)(void *st) nogil
+        double (*next_double)(void *st) nogil
+        uint64_t (*next_raw)(void *st) nogil
+
+    ctypedef bitgen bitgen_t
 
 cdef class BitGenerator():
     cdef readonly object _seed_seq
diff --git a/numpy/random/bit_generator.pyx b/numpy/random/_bit_generator.pyx
index eb608af6c..21d21e6bb 100644
--- a/numpy/random/bit_generator.pyx
+++ b/numpy/random/_bit_generator.pyx
@@ -53,9 +53,7 @@ from cpython.pycapsule cimport PyCapsule_New
 import numpy as np
 cimport numpy as np
 
-from libc.stdint cimport uint32_t
-from .common cimport (random_raw, benchmark, prepare_ctypes, prepare_cffi)
-from .distributions cimport bitgen_t
+from ._common cimport (random_raw, benchmark, prepare_ctypes, prepare_cffi)
 
 __all__ = ['SeedSequence', 'BitGenerator']
 
@@ -116,7 +114,7 @@ def _coerce_to_uint32_array(x):
     Examples
     --------
     >>> import numpy as np
-    >>> from numpy.random.bit_generator import _coerce_to_uint32_array
+    >>> from numpy.random._bit_generator import _coerce_to_uint32_array
     >>> _coerce_to_uint32_array(12345)
     array([12345], dtype=uint32)
     >>> _coerce_to_uint32_array('12345')
@@ -484,13 +482,12 @@ cdef class BitGenerator():
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
-        `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        `~numpy.random.SeedSequence` to derive the initial `BitGenerator` state.
+        One may also pass in a `SeedSequence` instance.
 
     Attributes
     ----------
diff --git a/numpy/random/_bounded_integers.pxd b/numpy/random/_bounded_integers.pxd
new file mode 100644
index 000000000..d3ee97a70
--- /dev/null
+++ b/numpy/random/_bounded_integers.pxd
@@ -0,0 +1,29 @@
+from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
+                          int8_t, int16_t, int32_t, int64_t, intptr_t)
+import numpy as np
+cimport numpy as np
+ctypedef np.npy_bool bool_t
+
+from ._bit_generator cimport bitgen_t
+
+cdef inline uint64_t _gen_mask(uint64_t max_val) nogil:
+    """Mask generator for use in bounded random numbers"""
+    # Smallest bit mask >= max
+    cdef uint64_t mask = max_val
+    mask |= mask >> 1
+    mask |= mask >> 2
+    mask |= mask >> 4
+    mask |= mask >> 8
+    mask |= mask >> 16
+    mask |= mask >> 32
+    return mask
+
+cdef object _rand_uint64(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint32(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint16(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint8(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_bool(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int64(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int32(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int16(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int8(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
diff --git a/numpy/random/bounded_integers.pxd.in b/numpy/random/_bounded_integers.pxd.in
index 7a3f224dc..320d35774 100644
--- a/numpy/random/bounded_integers.pxd.in
+++ b/numpy/random/_bounded_integers.pxd.in
@@ -4,7 +4,7 @@ import numpy as np
 cimport numpy as np
 ctypedef np.npy_bool bool_t
 
-from .common cimport bitgen_t
+from ._bit_generator cimport bitgen_t
 
 cdef inline uint64_t _gen_mask(uint64_t max_val) nogil:
     """Mask generator for use in bounded random numbers"""
diff --git a/numpy/random/_bounded_integers.pyx b/numpy/random/_bounded_integers.pyx
new file mode 100644
index 000000000..d6a534b43
--- /dev/null
+++ b/numpy/random/_bounded_integers.pyx
@@ -0,0 +1,1564 @@
+#!python
+#cython: wraparound=False, nonecheck=False, boundscheck=False, cdivision=True
+
+import numpy as np
+cimport numpy as np
+
+__all__ = []
+
+np.import_array()
+
+cdef extern from "include/distributions.h":
+    # Generate random numbers in closed interval [off, off + rng].
+    uint64_t random_bounded_uint64(bitgen_t *bitgen_state,
+                                   uint64_t off, uint64_t rng,
+                                   uint64_t mask, bint use_masked) nogil
+    uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state,
+                                            uint32_t off, uint32_t rng,
+                                            uint32_t mask, bint use_masked,
+                                            int *bcnt, uint32_t *buf) nogil
+    uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state,
+                                            uint16_t off, uint16_t rng,
+                                            uint16_t mask, bint use_masked,
+                                            int *bcnt, uint32_t *buf) nogil
+    uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state,
+                                          uint8_t off, uint8_t rng,
+                                          uint8_t mask, bint use_masked,
+                                          int *bcnt, uint32_t *buf) nogil
+    np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state,
+                                             np.npy_bool off, np.npy_bool rng,
+                                             np.npy_bool mask, bint use_masked,
+                                             int *bcnt, uint32_t *buf) nogil
+    void random_bounded_uint64_fill(bitgen_t *bitgen_state,
+                                    uint64_t off, uint64_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint64_t *out) nogil
+    void random_bounded_uint32_fill(bitgen_t *bitgen_state,
+                                    uint32_t off, uint32_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint32_t *out) nogil
+    void random_bounded_uint16_fill(bitgen_t *bitgen_state,
+                                    uint16_t off, uint16_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint16_t *out) nogil
+    void random_bounded_uint8_fill(bitgen_t *bitgen_state,
+                                   uint8_t off, uint8_t rng, np.npy_intp cnt,
+                                   bint use_masked,
+                                   uint8_t *out) nogil
+    void random_bounded_bool_fill(bitgen_t *bitgen_state,
+                                  np.npy_bool off, np.npy_bool rng, np.npy_intp cnt,
+                                  bint use_masked,
+                                  np.npy_bool *out) nogil
+
+
+
+_integers_types = {'bool': (0, 2),
+                 'int8': (-2**7, 2**7),
+                 'int16': (-2**15, 2**15),
+                 'int32': (-2**31, 2**31),
+                 'int64': (-2**63, 2**63),
+                 'uint8': (0, 2**8),
+                 'uint16': (0, 2**16),
+                 'uint32': (0, 2**32),
+                 'uint64': (0, 2**64)}
+
+
+cdef object _rand_uint32_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    This path is simpler since the high value in the open interval [low, high)
+    must be in-range for the next larger type, uint64. Here we cast to
+    this type for checking and then recast to uint32 when producing the
+    random integers.
+    """
+    cdef uint32_t rng, last_rng, off, val, mask, out_val, is_open
+    cdef uint32_t buf
+    cdef uint32_t *out_data
+    cdef uint64_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Array path
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, 0)):
+        raise ValueError('low is out of bounds for uint32')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0X100000000ULL)):
+        raise ValueError('high is out of bounds for uint32')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.uint32)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.uint32)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint32_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # For open intervals subtract 1: the generator samples the closed range [off, off+rng]
+            rng = <uint32_t>((high_v - is_open) - low_v)
+            off = <uint32_t>(<uint64_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= max
+                mask = <uint32_t>_gen_mask(rng)
+
+            out_data[i] = random_buffered_bounded_uint32(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_uint16_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    This path is simpler since the high value in the open interval [low, high)
+    must be in-range for the next larger type, uint32. Here we cast to
+    this type for checking and then recast to uint16 when producing the
+    random integers.
+    """
+    cdef uint16_t rng, last_rng, off, val, mask, out_val, is_open
+    cdef uint32_t buf
+    cdef uint16_t *out_data
+    cdef uint32_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Array path
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, 0)):
+        raise ValueError('low is out of bounds for uint16')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0X10000UL)):
+        raise ValueError('high is out of bounds for uint16')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT32, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT32, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.uint16)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.uint16)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint16_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # For open intervals subtract 1: the generator samples the closed range [off, off+rng]
+            rng = <uint16_t>((high_v - is_open) - low_v)
+            off = <uint16_t>(<uint32_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= max
+                mask = <uint16_t>_gen_mask(rng)
+
+            out_data[i] = random_buffered_bounded_uint16(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_uint8_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    This path is simpler since the high value in the open interval [low, high)
+    must be in-range for the next larger type, uint16. Here we cast to
+    this type for checking and then recast to uint8 when producing the
+    random integers.
+    """
+    cdef uint8_t rng, last_rng, off, val, mask, out_val, is_open
+    cdef uint32_t buf
+    cdef uint8_t *out_data
+    cdef uint16_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Array path
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, 0)):
+        raise ValueError('low is out of bounds for uint8')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0X100UL)):
+        raise ValueError('high is out of bounds for uint8')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT16, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT16, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.uint8)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.uint8)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint8_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # For open intervals subtract 1: the generator samples the closed range [off, off+rng]
+            rng = <uint8_t>((high_v - is_open) - low_v)
+            off = <uint8_t>(<uint16_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= max
+                mask = <uint8_t>_gen_mask(rng)
+
+            out_data[i] = random_buffered_bounded_uint8(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_bool_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    This path is simpler since the high value in the open interval [low, high)
+    must be in-range for the next larger type, uint8. Here we cast to
+    this type for checking and then recast to bool when producing the
+    random integers.
+    """
+    cdef bool_t rng, last_rng, off, val, mask, out_val, is_open
+    cdef uint32_t buf
+    cdef bool_t *out_data
+    cdef uint8_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Array path
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, 0)):
+        raise ValueError('low is out of bounds for bool')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0x2UL)):
+        raise ValueError('high is out of bounds for bool')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT8, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT8, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.bool_)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.bool_)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <bool_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint8_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint8_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # For open intervals subtract 1: the generator samples the closed range [off, off+rng]
+            rng = <bool_t>((high_v - is_open) - low_v)
+            off = <bool_t>(<uint8_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= max
+                mask = <bool_t>_gen_mask(rng)
+
+            out_data[i] = random_buffered_bounded_bool(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_int32_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    This path is simpler since the high value in the open interval [low, high)
+    must be in-range for the next larger type, uint64. Here we cast to
+    this type for checking and then recast to int32 when producing the
+    random integers.
+    """
+    cdef uint32_t rng, last_rng, off, val, mask, out_val, is_open
+    cdef uint32_t buf
+    cdef uint32_t *out_data
+    cdef uint64_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Array path
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, -0x80000000LL)):
+        raise ValueError('low is out of bounds for int32')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0x80000000LL)):
+        raise ValueError('high is out of bounds for int32')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.int32)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.int32)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint32_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # Subtract 1 since generator produces values on the closed int [off, off+rng]
+            rng = <uint32_t>((high_v - is_open) - low_v)
+            off = <uint32_t>(<uint64_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= max
+                mask = <uint32_t>_gen_mask(rng)
+
+            out_data[i] = random_buffered_bounded_uint32(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_int16_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    `low`/`high` are bounds-checked as given (ValueError on failure), then cast
+    to the next larger type so that `high` of the open interval [low, high)
+    stays in range.  One int16 is drawn per broadcast element under `lock`;
+    the result has shape `size`, or the broadcast shape when `size` is None.
+    """
+    cdef uint16_t rng, last_rng, off, mask, is_open
+    cdef uint32_t buf
+    cdef uint16_t *out_data
+    cdef uint32_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Bounds are checked on the caller's arrays, before the forced casts below
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, -0x8000LL)):
+        raise ValueError('low is out of bounds for int16')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0x8000LL)):
+        raise ValueError('high is out of bounds for int16')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT32, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT32, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.int16)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.int16)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint16_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # Subtract 1 since generator produces values on the closed int [off, off+rng]
+            rng = <uint16_t>((high_v - is_open) - low_v)
+            off = <uint16_t>(<uint32_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= rng; reused until the range changes
+                mask = <uint16_t>_gen_mask(rng)
+                last_rng = rng
+            out_data[i] = random_buffered_bounded_uint16(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+cdef object _rand_int8_broadcast(np.ndarray low, np.ndarray high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for smaller integer types
+
+    `low`/`high` are bounds-checked as given (ValueError on failure), then cast
+    to the next larger type so that `high` of the open interval [low, high)
+    stays in range.  One int8 is drawn per broadcast element under `lock`;
+    the result has shape `size`, or the broadcast shape when `size` is None.
+    """
+    cdef uint8_t rng, last_rng, off, mask, is_open
+    cdef uint32_t buf
+    cdef uint8_t *out_data
+    cdef uint16_t low_v, high_v
+    cdef np.ndarray low_arr, high_arr, out_arr
+    cdef np.npy_intp i, cnt
+    cdef np.broadcast it
+    cdef int buf_rem = 0
+
+    # Bounds are checked on the caller's arrays, before the forced casts below
+    is_open = not closed
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+    if np.any(np.less(low_arr, -0x80LL)):
+        raise ValueError('low is out of bounds for int8')
+    if closed:
+        high_comp = np.greater_equal
+        low_high_comp = np.greater
+    else:
+        high_comp = np.greater
+        low_high_comp = np.greater_equal
+
+    if np.any(high_comp(high_arr, 0x80LL)):
+        raise ValueError('high is out of bounds for int8')
+    if np.any(low_high_comp(low_arr, high_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT16, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT16, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.int8)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.int8)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint8_t *>np.PyArray_DATA(out_arr)
+    cnt = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(cnt):
+            low_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # Subtract 1 since generator produces values on the closed int [off, off+rng]
+            rng = <uint8_t>((high_v - is_open) - low_v)
+            off = <uint8_t>(<uint16_t>low_v)
+
+            if rng != last_rng:
+                # Smallest bit mask >= rng; reused until the range changes
+                mask = <uint8_t>_gen_mask(rng)
+                last_rng = rng
+            out_data[i] = random_buffered_bounded_uint8(state, off, rng, mask, use_masked, &buf_rem, &buf)
+
+            np.PyArray_MultiIter_NEXT(it)
+    return out_arr
+
+
+cdef object _rand_uint64_broadcast(object low, object high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for 64-bit integer types
+
+    Requires special treatment since the high value can be out-of-range for
+    the largest (64 bit) integer type when the interval is given as [low, high).
+    Raises ValueError for out-of-range bounds or an empty interval.
+
+    The internal generator does not have this issue since it generates from
+    the closed interval [low, high-1] and high-1 is always in range for the
+    64 bit integer type.  Returns an array of shape `size` or the broadcast shape.
+    """
+
+    cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr
+    cdef np.npy_intp i, cnt, n
+    cdef np.broadcast it
+    cdef object closed_upper
+    cdef uint64_t *out_data
+    cdef uint64_t *highm1_data
+    cdef uint64_t low_v, high_v
+    cdef uint64_t rng, last_rng, mask, off
+
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+
+    if np.any(np.less(low_arr, 0x0ULL)):
+        raise ValueError('low is out of bounds for uint64')
+    dt = high_arr.dtype
+    if closed or np.issubdtype(dt, np.integer):
+        # Avoid object dtype path if already an integer
+        high_lower_comp = np.less if closed else np.less_equal
+        if np.any(high_lower_comp(high_arr, 0x0ULL)):
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+        high_m1 = high_arr if closed else high_arr - dt.type(1)
+        if np.any(np.greater(high_m1, 0xFFFFFFFFFFFFFFFFULL)):
+            raise ValueError('high is out of bounds for uint64')
+        highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    else:
+        # If input is object or a floating type
+        highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.uint64)
+        highm1_data = <uint64_t *>np.PyArray_DATA(highm1_arr)
+        cnt = np.PyArray_SIZE(high_arr)
+        flat = high_arr.flat
+        for i in range(cnt):
+            # Subtract 1 since generator produces values on the closed int [off, off+rng]
+            closed_upper = int(flat[i]) - 1
+            if closed_upper > 0xFFFFFFFFFFFFFFFFULL:
+                raise ValueError('high is out of bounds for uint64')
+            if closed_upper < 0x0ULL:
+                comp = '>' if closed else '>='
+                raise ValueError('low {comp} high'.format(comp=comp))
+            highm1_data[i] = <uint64_t>closed_upper
+
+    if np.any(np.greater(low_arr, highm1_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    high_arr = highm1_arr
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.uint64)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.uint64)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint64_t *>np.PyArray_DATA(out_arr)
+    n = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(n):
+            low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # Generator produces values on the closed int [off, off+rng], -1 subtracted above
+            rng = <uint64_t>(high_v - low_v)
+            off = <uint64_t>(<uint64_t>low_v)
+
+            if rng != last_rng:
+                mask = _gen_mask(rng)
+                last_rng = rng  # remember the range so the mask is reused
+            out_data[i] = random_bounded_uint64(state, off, rng, mask, use_masked)
+            np.PyArray_MultiIter_NEXT(it)
+
+    return out_arr
+
+cdef object _rand_int64_broadcast(object low, object high, object size,
+                                       bint use_masked, bint closed,
+                                       bitgen_t *state, object lock):
+    """
+    Array path for 64-bit integer types
+
+    Requires special treatment since the high value can be out-of-range for
+    the largest (64 bit) integer type when the interval is given as [low, high).
+    Raises ValueError for out-of-range bounds or an empty interval.
+
+    The internal generator does not have this issue since it generates from
+    the closed interval [low, high-1] and high-1 is always in range for the
+    64 bit integer type.  Returns an array of shape `size` or the broadcast shape.
+    """
+
+    cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr
+    cdef np.npy_intp i, cnt, n
+    cdef np.broadcast it
+    cdef object closed_upper
+    cdef uint64_t *out_data
+    cdef int64_t *highm1_data
+    cdef int64_t low_v, high_v
+    cdef uint64_t rng, last_rng, mask, off
+
+    low_arr = <np.ndarray>low
+    high_arr = <np.ndarray>high
+
+    if np.any(np.less(low_arr, -0x8000000000000000LL)):
+        raise ValueError('low is out of bounds for int64')
+    dt = high_arr.dtype
+    if closed or np.issubdtype(dt, np.integer):
+        # Avoid object dtype path if already an integer
+        high_lower_comp = np.less if closed else np.less_equal
+        if np.any(high_lower_comp(high_arr, -0x8000000000000000LL)):
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+        high_m1 = high_arr if closed else high_arr - dt.type(1)
+        if np.any(np.greater(high_m1, 0x7FFFFFFFFFFFFFFFLL)):
+            raise ValueError('high is out of bounds for int64')
+        highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    else:
+        # If input is object or a floating type
+        highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.int64)
+        highm1_data = <int64_t *>np.PyArray_DATA(highm1_arr)
+        cnt = np.PyArray_SIZE(high_arr)
+        flat = high_arr.flat
+        for i in range(cnt):
+            # Subtract 1 since generator produces values on the closed int [off, off+rng]
+            closed_upper = int(flat[i]) - 1
+            if closed_upper > 0x7FFFFFFFFFFFFFFFLL:
+                raise ValueError('high is out of bounds for int64')
+            if closed_upper < -0x8000000000000000LL:
+                comp = '>' if closed else '>='
+                raise ValueError('low {comp} high'.format(comp=comp))
+            highm1_data[i] = <int64_t>closed_upper
+
+    if np.any(np.greater(low_arr, highm1_arr)):
+        comp = '>' if closed else '>='
+        raise ValueError('low {comp} high'.format(comp=comp))
+
+    high_arr = highm1_arr
+    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST)
+
+    if size is not None:
+        out_arr = <np.ndarray>np.empty(size, np.int64)
+    else:
+        it = np.PyArray_MultiIterNew2(low_arr, high_arr)
+        out_arr = <np.ndarray>np.empty(it.shape, np.int64)
+
+    it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr)
+    out_data = <uint64_t *>np.PyArray_DATA(out_arr)
+    n = np.PyArray_SIZE(out_arr)
+    mask = last_rng = 0
+    with lock, nogil:
+        for i in range(n):
+            low_v = (<int64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
+            high_v = (<int64_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
+            # Generator produces values on the closed int [off, off+rng], -1 subtracted above
+            rng = <uint64_t>(high_v - low_v)
+            off = <uint64_t>(<int64_t>low_v)
+
+            if rng != last_rng:
+                mask = _gen_mask(rng)
+                last_rng = rng  # remember the range so the mask is reused
+            out_data[i] = random_bounded_uint64(state, off, rng, mask, use_masked)
+            np.PyArray_MultiIter_NEXT(it)
+
+    return out_arr
+
+
+cdef object _rand_uint64(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_uint64(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.uint64 integers from `low` (inclusive) to `high` (exclusive).
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Raises ValueError if a bound does not fit in np.uint64 or if the
+    interval is empty.  A zero-sized `size` returns an empty array at once.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest integer to be drawn from the distribution.  If both bounds
+        are 0-d (or one-element 1-d with `size` given) the scalar fill is
+        used; otherwise `low` and `high` are broadcast.
+    high : int or array-like
+        Upper bound of the interval: one above the largest integer to be
+        drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock held while drawing so the generator is not used concurrently
+
+    Returns
+    -------
+    out : np.uint64 scalar or ndarray of np.uint64
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for uint64. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint64_t rng, off, out_val
+    cdef uint64_t *out_data
+    cdef np.npy_intp cnt
+
+    if size is not None:
+        if (np.prod(size) == 0):
+            return np.empty(size, dtype=np.uint64)
+
+    low_arr = <np.ndarray>np.asarray(low)  # copies only when a conversion is needed
+    high_arr = <np.ndarray>np.asarray(high)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < 0x0ULL:
+            raise ValueError("low is out of bounds for uint64")
+        if high > 0xFFFFFFFFFFFFFFFFULL:
+            raise ValueError("high is out of bounds for uint64")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint64_t>(high - low)
+        off = <uint64_t>(<uint64_t>low)
+        if size is None:
+            with lock:
+                random_bounded_uint64_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.uint64(<uint64_t>out_val)
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.uint64)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint64_t *>np.PyArray_DATA(out_arr)
+            with lock, nogil:
+                random_bounded_uint64_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_uint64_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_uint32(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_uint32(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.uint32 integers from `low` (inclusive) to `high` (exclusive).
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Raises ValueError if a bound does not fit in np.uint32 or if the
+    interval is empty.  A zero-sized `size` returns an empty array at once.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest integer to be drawn from the distribution.  If both bounds
+        are 0-d (or one-element 1-d with `size` given) the scalar fill is
+        used; otherwise `low` and `high` are broadcast.
+    high : int or array-like
+        Upper bound of the interval: one above the largest integer to be
+        drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock held while drawing so the generator is not used concurrently
+
+    Returns
+    -------
+    out : np.uint32 scalar or ndarray of np.uint32
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for uint32. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint32_t rng, off, out_val
+    cdef uint32_t *out_data
+    cdef np.npy_intp cnt
+
+    if size is not None:
+        if (np.prod(size) == 0):
+            return np.empty(size, dtype=np.uint32)
+
+    low_arr = <np.ndarray>np.asarray(low)  # copies only when a conversion is needed
+    high_arr = <np.ndarray>np.asarray(high)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < 0x0UL:
+            raise ValueError("low is out of bounds for uint32")
+        if high > 0XFFFFFFFFUL:
+            raise ValueError("high is out of bounds for uint32")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint32_t>(high - low)
+        off = <uint32_t>(<uint32_t>low)
+        if size is None:
+            with lock:
+                random_bounded_uint32_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.uint32(<uint32_t>out_val)
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.uint32)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint32_t *>np.PyArray_DATA(out_arr)
+            with lock, nogil:
+                random_bounded_uint32_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_uint32_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_uint16(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_uint16(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.uint16 integers from `low` (inclusive) to `high` (exclusive).
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Raises ValueError if a bound does not fit in np.uint16 or if the
+    interval is empty.  A zero-sized `size` returns an empty array at once.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest integer to be drawn from the distribution.  If both bounds
+        are 0-d (or one-element 1-d with `size` given) the scalar fill is
+        used; otherwise `low` and `high` are broadcast.
+    high : int or array-like
+        Upper bound of the interval: one above the largest integer to be
+        drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock held while drawing so the generator is not used concurrently
+
+    Returns
+    -------
+    out : np.uint16 scalar or ndarray of np.uint16
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for uint16. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint16_t rng, off, out_val
+    cdef uint16_t *out_data
+    cdef np.npy_intp cnt
+
+    if size is not None:
+        if (np.prod(size) == 0):
+            return np.empty(size, dtype=np.uint16)
+
+    low_arr = <np.ndarray>np.asarray(low)  # copies only when a conversion is needed
+    high_arr = <np.ndarray>np.asarray(high)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < 0x0UL:
+            raise ValueError("low is out of bounds for uint16")
+        if high > 0XFFFFUL:
+            raise ValueError("high is out of bounds for uint16")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint16_t>(high - low)
+        off = <uint16_t>(<uint16_t>low)
+        if size is None:
+            with lock:
+                random_bounded_uint16_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.uint16(<uint16_t>out_val)
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.uint16)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint16_t *>np.PyArray_DATA(out_arr)
+            with lock, nogil:
+                random_bounded_uint16_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_uint16_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_uint8(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_uint8(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.uint8 integers from `low` (inclusive) to `high` (exclusive).
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Raises ValueError if a bound does not fit in np.uint8 or if the
+    interval is empty.  A zero-sized `size` returns an empty array at once.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest integer to be drawn from the distribution.  If both bounds
+        are 0-d (or one-element 1-d with `size` given) the scalar fill is
+        used; otherwise `low` and `high` are broadcast.
+    high : int or array-like
+        Upper bound of the interval: one above the largest integer to be
+        drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock held while drawing so the generator is not used concurrently
+
+    Returns
+    -------
+    out : np.uint8 scalar or ndarray of np.uint8
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for uint8. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint8_t rng, off, out_val
+    cdef uint8_t *out_data
+    cdef np.npy_intp cnt
+
+    if size is not None:
+        if (np.prod(size) == 0):
+            return np.empty(size, dtype=np.uint8)
+
+    low_arr = <np.ndarray>np.asarray(low)  # copies only when a conversion is needed
+    high_arr = <np.ndarray>np.asarray(high)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < 0x0UL:
+            raise ValueError("low is out of bounds for uint8")
+        if high > 0XFFUL:
+            raise ValueError("high is out of bounds for uint8")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint8_t>(high - low)
+        off = <uint8_t>(<uint8_t>low)
+        if size is None:
+            with lock:
+                random_bounded_uint8_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.uint8(<uint8_t>out_val)
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.uint8)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint8_t *>np.PyArray_DATA(out_arr)
+            with lock, nogil:
+                random_bounded_uint8_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_uint8_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_bool(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_bool(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.bool_ values between `low` and `high`.
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Scalar bounds are range-checked here against bool; array bounds
+    are handed to the `_rand_bool_broadcast` helper.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest integer to be drawn from the distribution.
+        Converted with ``np.array(low, copy=False)``; no ``None``
+        default is applied in this function.
+    high : int or array-like
+        Upper bound of the distribution: one above the largest integer to
+        be drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock to prevent multiple threads using a single generator simultaneously
+
+    Returns
+    -------
+    out : np.bool_ scalar or ndarray of np.bool_
+          `size`-shaped array of random values from the appropriate
+          distribution, or a single such random value if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for bool. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef bool_t rng, off, out_val
+    cdef bool_t *out_data
+    cdef np.npy_intp cnt
+
+    if size is not None:
+        if (np.prod(size) == 0):  # empty request: returned before the bounds are validated
+            return np.empty(size, dtype=np.bool_)  # np.bool_ matches the scalar path below
+
+    low_arr = <np.ndarray>np.array(low, copy=False)
+    high_arr = <np.ndarray>np.array(high, copy=False)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)  # scalar path: both bounds collapse to Python ints
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < 0x0UL:
+            raise ValueError("low is out of bounds for bool")
+        if high > 0x1UL:
+            raise ValueError("high is out of bounds for bool")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <bool_t>(high - low)  # width of the closed interval
+        off = <bool_t>(<bool_t>low)
+        if size is None:
+            with lock:
+                random_bounded_bool_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.bool_(<bool_t>out_val)
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.bool_)  # was the removed alias np.bool
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <bool_t *>np.PyArray_DATA(out_arr)
+            with lock, nogil:  # GIL released while the buffer is filled
+                random_bounded_bool_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_bool_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_int64(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_int64(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.int64 integers between `low` and `high`.
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Scalar bounds are range-checked here against int64; array bounds
+    are handed to the `_rand_int64_broadcast` helper.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest (signed) integer to be drawn from the distribution.
+        Converted with ``np.array(low, copy=False)``; no ``None``
+        default is applied in this function.
+    high : int or array-like
+        Upper bound of the distribution: one above the largest integer to
+        be drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock to prevent multiple threads using a single generator simultaneously
+
+    Returns
+    -------
+    out : np.int64 scalar or ndarray of np.int64
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for int64. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint64_t rng, off, out_val
+    cdef uint64_t *out_data
+    cdef np.npy_intp i, n, cnt  # NOTE(review): i and n are not used in this function
+
+    if size is not None:
+        if (np.prod(size) == 0):  # empty request: returned before the bounds are validated
+            return np.empty(size, dtype=np.int64)
+
+    low_arr = <np.ndarray>np.array(low, copy=False)
+    high_arr = <np.ndarray>np.array(high, copy=False)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)  # scalar path: both bounds collapse to Python ints
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < -0x8000000000000000LL:
+            raise ValueError("low is out of bounds for int64")
+        if high > 0x7FFFFFFFFFFFFFFFL:
+            raise ValueError("high is out of bounds for int64")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint64_t>(high - low)  # width of the closed interval
+        off = <uint64_t>(<int64_t>low)  # signed offset cast to the unsigned work type
+        if size is None:
+            with lock:
+                random_bounded_uint64_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.int64(<int64_t>out_val)  # cast back to signed before boxing
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.int64)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint64_t *>np.PyArray_DATA(out_arr)  # int64 buffer written through an unsigned pointer
+            with lock, nogil:  # GIL released while the buffer is filled
+                random_bounded_uint64_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_int64_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_int32(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_int32(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.int32 integers between `low` and `high`.
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Scalar bounds are range-checked here against int32; array bounds
+    are handed to the `_rand_int32_broadcast` helper.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest (signed) integer to be drawn from the distribution.
+        Converted with ``np.array(low, copy=False)``; no ``None``
+        default is applied in this function.
+    high : int or array-like
+        Upper bound of the distribution: one above the largest integer to
+        be drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock to prevent multiple threads using a single generator simultaneously
+
+    Returns
+    -------
+    out : np.int32 scalar or ndarray of np.int32
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for int32. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint32_t rng, off, out_val
+    cdef uint32_t *out_data
+    cdef np.npy_intp i, n, cnt  # NOTE(review): i and n are not used in this function
+
+    if size is not None:
+        if (np.prod(size) == 0):  # empty request: returned before the bounds are validated
+            return np.empty(size, dtype=np.int32)
+
+    low_arr = <np.ndarray>np.array(low, copy=False)
+    high_arr = <np.ndarray>np.array(high, copy=False)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)  # scalar path: both bounds collapse to Python ints
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < -0x80000000L:
+            raise ValueError("low is out of bounds for int32")
+        if high > 0x7FFFFFFFL:
+            raise ValueError("high is out of bounds for int32")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint32_t>(high - low)  # width of the closed interval
+        off = <uint32_t>(<int32_t>low)  # signed offset cast to the unsigned work type
+        if size is None:
+            with lock:
+                random_bounded_uint32_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.int32(<int32_t>out_val)  # cast back to signed before boxing
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.int32)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint32_t *>np.PyArray_DATA(out_arr)  # int32 buffer written through an unsigned pointer
+            with lock, nogil:  # GIL released while the buffer is filled
+                random_bounded_uint32_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_int32_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_int16(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_int16(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.int16 integers between `low` and `high`.
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Scalar bounds are range-checked here against int16; array bounds
+    are handed to the `_rand_int16_broadcast` helper.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest (signed) integer to be drawn from the distribution.
+        Converted with ``np.array(low, copy=False)``; no ``None``
+        default is applied in this function.
+    high : int or array-like
+        Upper bound of the distribution: one above the largest integer to
+        be drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock to prevent multiple threads using a single generator simultaneously
+
+    Returns
+    -------
+    out : np.int16 scalar or ndarray of np.int16
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for int16. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint16_t rng, off, out_val
+    cdef uint16_t *out_data
+    cdef np.npy_intp i, n, cnt  # NOTE(review): i and n are not used in this function
+
+    if size is not None:
+        if (np.prod(size) == 0):  # empty request: returned before the bounds are validated
+            return np.empty(size, dtype=np.int16)
+
+    low_arr = <np.ndarray>np.array(low, copy=False)
+    high_arr = <np.ndarray>np.array(high, copy=False)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)  # scalar path: both bounds collapse to Python ints
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < -0x8000L:
+            raise ValueError("low is out of bounds for int16")
+        if high > 0x7FFFL:
+            raise ValueError("high is out of bounds for int16")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint16_t>(high - low)  # width of the closed interval
+        off = <uint16_t>(<int16_t>low)  # signed offset cast to the unsigned work type
+        if size is None:
+            with lock:
+                random_bounded_uint16_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.int16(<int16_t>out_val)  # cast back to signed before boxing
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.int16)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint16_t *>np.PyArray_DATA(out_arr)  # int16 buffer written through an unsigned pointer
+            with lock, nogil:  # GIL released while the buffer is filled
+                random_bounded_uint16_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_int16_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
+
+cdef object _rand_int8(object low, object high, object size,
+                             bint use_masked, bint closed,
+                             bitgen_t *state, object lock):
+    """
+    _rand_int8(low, high, size, use_masked, closed, *state, lock)
+
+    Return random np.int8 integers between `low` and `high`.
+
+    Return random integers from the "discrete uniform" distribution in the
+    interval [`low`, `high`), or [`low`, `high`] when `closed` is True.
+    Scalar bounds are range-checked here against int8; array bounds
+    are handed to the `_rand_int8_broadcast` helper.
+
+    Parameters
+    ----------
+    low : int or array-like
+        Lowest (signed) integer to be drawn from the distribution.
+        Converted with ``np.array(low, copy=False)``; no ``None``
+        default is applied in this function.
+    high : int or array-like
+        Upper bound of the distribution: one above the largest integer to
+        be drawn, or the largest integer itself when `closed` is True.
+    size : int or tuple of ints
+        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+        ``m * n * k`` samples are drawn.  Default is None, in which case a
+        single value is returned.
+    use_masked : bool
+        If True then rejection sampling with a range mask is used else Lemire's algorithm is used.
+    closed : bool
+        If True then sample from [low, high].  If False, sample [low, high)
+    state : bit generator
+        Bit generator state to use in the core random number generators
+    lock : threading.Lock
+        Lock to prevent multiple threads using a single generator simultaneously
+
+    Returns
+    -------
+    out : np.int8 scalar or ndarray of np.int8
+          `size`-shaped array of random integers from the appropriate
+          distribution, or a single such random int if `size` not provided.
+
+    Notes
+    -----
+    The internal integer generator produces values from the closed
+    interval [low, high-(not closed)].  This requires some care since
+    high can be out-of-range for int8. The scalar path leaves
+    integers as Python integers until the 1 has been subtracted to
+    avoid needing to cast to a larger type.
+    """
+    cdef np.ndarray out_arr, low_arr, high_arr
+    cdef uint8_t rng, off, out_val
+    cdef uint8_t *out_data
+    cdef np.npy_intp i, n, cnt  # NOTE(review): i and n are not used in this function
+
+    if size is not None:
+        if (np.prod(size) == 0):  # empty request: returned before the bounds are validated
+            return np.empty(size, dtype=np.int8)
+
+    low_arr = <np.ndarray>np.array(low, copy=False)
+    high_arr = <np.ndarray>np.array(high, copy=False)
+    low_ndim = np.PyArray_NDIM(low_arr)
+    high_ndim = np.PyArray_NDIM(high_arr)
+    if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and
+            (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))):
+        low = int(low_arr)  # scalar path: both bounds collapse to Python ints
+        high = int(high_arr)
+        # Subtract 1 since internal generator produces on closed interval [low, high]
+        if not closed:
+            high -= 1
+
+        if low < -0x80L:
+            raise ValueError("low is out of bounds for int8")
+        if high > 0x7FL:
+            raise ValueError("high is out of bounds for int8")
+        if low > high:  # -1 already subtracted, closed interval
+            comp = '>' if closed else '>='
+            raise ValueError('low {comp} high'.format(comp=comp))
+
+        rng = <uint8_t>(high - low)  # width of the closed interval
+        off = <uint8_t>(<int8_t>low)  # signed offset cast to the unsigned work type
+        if size is None:
+            with lock:
+                random_bounded_uint8_fill(state, off, rng, 1, use_masked, &out_val)
+            return np.int8(<int8_t>out_val)  # cast back to signed before boxing
+        else:
+            out_arr = <np.ndarray>np.empty(size, np.int8)
+            cnt = np.PyArray_SIZE(out_arr)
+            out_data = <uint8_t *>np.PyArray_DATA(out_arr)  # int8 buffer written through an unsigned pointer
+            with lock, nogil:  # GIL released while the buffer is filled
+                random_bounded_uint8_fill(state, off, rng, cnt, use_masked, out_data)
+            return out_arr
+    return _rand_int8_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock)
diff --git a/numpy/random/bounded_integers.pyx.in b/numpy/random/_bounded_integers.pyx.in
index 411b65a37..47cb13b3a 100644
--- a/numpy/random/bounded_integers.pyx.in
+++ b/numpy/random/_bounded_integers.pyx.in
@@ -4,12 +4,54 @@
 import numpy as np
 cimport numpy as np
 
-from .distributions cimport *
-
 __all__ = []
 
 np.import_array()
 
+cdef extern from "include/distributions.h":
+    # Generate random numbers in closed interval [off, off + rng].
+    uint64_t random_bounded_uint64(bitgen_t *bitgen_state,
+                                   uint64_t off, uint64_t rng,
+                                   uint64_t mask, bint use_masked) nogil
+    uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state,
+                                            uint32_t off, uint32_t rng,
+                                            uint32_t mask, bint use_masked,
+                                            int *bcnt, uint32_t *buf) nogil
+    uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state,
+                                            uint16_t off, uint16_t rng,
+                                            uint16_t mask, bint use_masked,
+                                            int *bcnt, uint32_t *buf) nogil
+    uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state,
+                                          uint8_t off, uint8_t rng,
+                                          uint8_t mask, bint use_masked,
+                                          int *bcnt, uint32_t *buf) nogil
+    np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state,
+                                             np.npy_bool off, np.npy_bool rng,
+                                             np.npy_bool mask, bint use_masked,
+                                             int *bcnt, uint32_t *buf) nogil
+    void random_bounded_uint64_fill(bitgen_t *bitgen_state,
+                                    uint64_t off, uint64_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint64_t *out) nogil
+    void random_bounded_uint32_fill(bitgen_t *bitgen_state,
+                                    uint32_t off, uint32_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint32_t *out) nogil
+    void random_bounded_uint16_fill(bitgen_t *bitgen_state,
+                                    uint16_t off, uint16_t rng, np.npy_intp cnt,
+                                    bint use_masked,
+                                    uint16_t *out) nogil
+    void random_bounded_uint8_fill(bitgen_t *bitgen_state,
+                                   uint8_t off, uint8_t rng, np.npy_intp cnt,
+                                   bint use_masked,
+                                   uint8_t *out) nogil
+    void random_bounded_bool_fill(bitgen_t *bitgen_state,
+                                  np.npy_bool off, np.npy_bool rng, np.npy_intp cnt,
+                                  bint use_masked,
+                                  np.npy_bool *out) nogil
+
+
+
 _integers_types = {'bool': (0, 2),
                  'int8': (-2**7, 2**7),
                  'int16': (-2**15, 2**15),
diff --git a/numpy/random/common.pxd b/numpy/random/_common.pxd
index ac0a94bb0..74bebca83 100644
--- a/numpy/random/common.pxd
+++ b/numpy/random/_common.pxd
@@ -1,23 +1,12 @@
 #cython: language_level=3
 
-from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
-                          int8_t, int16_t, int32_t, int64_t, intptr_t,
-                          uintptr_t)
-from libc.math cimport sqrt
-
-cdef extern from "src/bitgen.h":
-    struct bitgen:
-        void *state
-        uint64_t (*next_uint64)(void *st) nogil
-        uint32_t (*next_uint32)(void *st) nogil
-        double (*next_double)(void *st) nogil
-        uint64_t (*next_raw)(void *st) nogil
-
-    ctypedef bitgen bitgen_t
+from libc.stdint cimport uint32_t, uint64_t, int32_t, int64_t
 
 import numpy as np
 cimport numpy as np
 
+from ._bit_generator cimport bitgen_t
+
 cdef double POISSON_LAM_MAX
 cdef double LEGACY_POISSON_LAM_MAX
 cdef uint64_t MAXSIZE
@@ -44,7 +33,7 @@ cdef object prepare_ctypes(bitgen_t *bitgen)
 cdef int check_constraint(double val, object name, constraint_type cons) except -1
 cdef int check_array_constraint(np.ndarray val, object name, constraint_type cons) except -1
 
-cdef extern from "src/aligned_malloc/aligned_malloc.h":
+cdef extern from "include/aligned_malloc.h":
     cdef void *PyArray_realloc_aligned(void *p, size_t n)
     cdef void *PyArray_malloc_aligned(size_t n)
     cdef void *PyArray_calloc_aligned(size_t n, size_t s)
@@ -56,6 +45,7 @@ ctypedef double (*random_double_1)(void *state, double a) nogil
 ctypedef double (*random_double_2)(void *state, double a, double b) nogil
 ctypedef double (*random_double_3)(void *state, double a, double b, double c) nogil
 
+ctypedef void (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out) nogil  # float32 fill routine: writes count values into out, returns nothing
 ctypedef float (*random_float_0)(bitgen_t *state) nogil
 ctypedef float (*random_float_1)(bitgen_t *state, float a) nogil
 
diff --git a/numpy/random/common.pyx b/numpy/random/_common.pyx
index 74cd5f033..ef1afac7c 100644
--- a/numpy/random/common.pyx
+++ b/numpy/random/_common.pyx
@@ -6,7 +6,7 @@ import sys
 import numpy as np
 cimport numpy as np
 
-from .common cimport *
+from libc.stdint cimport uintptr_t
 
 __all__ = ['interface']
 
@@ -262,14 +262,16 @@ cdef object double_fill(void *func, bitgen_t *state, object size, object lock, o
     return out_array
 
 cdef object float_fill(void *func, bitgen_t *state, object size, object lock, object out):
-    cdef random_float_0 random_func = (<random_float_0>func)
+    cdef random_float_fill random_func = (<random_float_fill>func)
+    cdef float out_val
     cdef float *out_array_data
     cdef np.ndarray out_array
     cdef np.npy_intp i, n
 
     if size is None and out is None:
         with lock:
-            return random_func(state)
+            random_func(state, 1, &out_val)
+            return out_val
 
     if out is not None:
         check_output(out, np.float32, size)
@@ -280,8 +282,7 @@ cdef object float_fill(void *func, bitgen_t *state, object size, object lock, ob
     n = np.PyArray_SIZE(out_array)
     out_array_data = <float *>np.PyArray_DATA(out_array)
     with lock, nogil:
-        for i in range(n):
-            out_array_data[i] = random_func(state)
+        random_func(state, n, out_array_data)
     return out_array
 
 cdef object float_fill_from_double(void *func, bitgen_t *state, object size, object lock, object out):
diff --git a/numpy/random/generator.pyx b/numpy/random/_generator.pyx
index df7485a97..22b17ab03 100644
--- a/numpy/random/generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -3,34 +3,128 @@
 import operator
 import warnings
 
-import numpy as np
-from numpy.core.multiarray import normalize_axis_index
-
-from .bounded_integers import _integers_types
-from .pcg64 import PCG64
-
 from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer
 from cpython cimport (Py_INCREF, PyFloat_AsDouble)
-from libc cimport string
 
 cimport cython
+import numpy as np
 cimport numpy as np
+from numpy.core.multiarray import normalize_axis_index
 
-from .bounded_integers cimport *
-from .common cimport *
-from .distributions cimport *
-
-
-__all__ = ['Generator', 'beta', 'binomial', 'bytes', 'chisquare', 'choice',
-           'dirichlet', 'exponential', 'f', 'gamma',
-           'geometric', 'gumbel', 'hypergeometric', 'integers', 'laplace',
-           'logistic', 'lognormal', 'logseries', 'multinomial',
-           'multivariate_normal', 'negative_binomial', 'noncentral_chisquare',
-           'noncentral_f', 'normal', 'pareto', 'permutation',
-           'poisson', 'power', 'random',  'rayleigh', 'shuffle',
-           'standard_cauchy', 'standard_exponential', 'standard_gamma',
-           'standard_normal', 'standard_t', 'triangular',
-           'uniform', 'vonmises', 'wald', 'weibull', 'zipf']
+from libc cimport string
+from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
+                          int32_t, int64_t)
+from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
+         _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16,
+         _rand_uint8, _gen_mask)
+from ._bounded_integers import _integers_types
+from ._pcg64 import PCG64
+from ._bit_generator cimport bitgen_t
+from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
+            CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1,
+            CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON,
+            double_fill, cont, kahan_sum, cont_broadcast_3, float_fill, cont_f,
+            check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
+        )
+
+
+cdef extern from "include/distributions.h":
+
+    struct s_binomial_t:
+        int has_binomial
+        double psave
+        int64_t nsave
+        double r
+        double q
+        double fm
+        int64_t m
+        double p1
+        double xm
+        double xl
+        double xr
+        double c
+        double laml
+        double lamr
+        double p2
+        double p3
+        double p4
+
+    ctypedef s_binomial_t binomial_t
+
+    double random_standard_uniform(bitgen_t *bitgen_state) nogil
+    void random_standard_uniform_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil
+    double random_standard_exponential(bitgen_t *bitgen_state) nogil
+    void random_standard_exponential_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil
+    double random_standard_exponential_zig(bitgen_t *bitgen_state) nogil
+    void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil
+    double random_standard_normal(bitgen_t* bitgen_state) nogil
+    void random_standard_normal_fill(bitgen_t *bitgen_state, np.npy_intp count, double *out) nogil
+    void random_standard_normal_fill_f(bitgen_t *bitgen_state, np.npy_intp count, float *out) nogil
+    double random_standard_gamma(bitgen_t *bitgen_state, double shape) nogil
+
+    float random_standard_uniform_f(bitgen_t *bitgen_state) nogil
+    void random_standard_uniform_fill_f(bitgen_t* bitgen_state, np.npy_intp cnt, float *out) nogil
+    float random_standard_exponential_f(bitgen_t *bitgen_state) nogil
+    float random_standard_exponential_zig_f(bitgen_t *bitgen_state) nogil
+    void random_standard_exponential_fill_f(bitgen_t *bitgen_state, np.npy_intp cnt, float *out) nogil
+    void random_standard_exponential_zig_fill_f(bitgen_t *bitgen_state, np.npy_intp cnt, float *out) nogil
+    float random_standard_normal_f(bitgen_t* bitgen_state) nogil
+    float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil
+
+    int64_t random_positive_int64(bitgen_t *bitgen_state) nogil
+    int32_t random_positive_int32(bitgen_t *bitgen_state) nogil
+    int64_t random_positive_int(bitgen_t *bitgen_state) nogil
+    uint64_t random_uint(bitgen_t *bitgen_state) nogil
+
+    double random_normal(bitgen_t *bitgen_state, double loc, double scale) nogil
+
+    double random_gamma(bitgen_t *bitgen_state, double shape, double scale) nogil
+    float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale) nogil
+
+    double random_exponential(bitgen_t *bitgen_state, double scale) nogil
+    double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil
+    double random_beta(bitgen_t *bitgen_state, double a, double b) nogil
+    double random_chisquare(bitgen_t *bitgen_state, double df) nogil
+    double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) nogil
+    double random_standard_cauchy(bitgen_t *bitgen_state) nogil
+    double random_pareto(bitgen_t *bitgen_state, double a) nogil
+    double random_weibull(bitgen_t *bitgen_state, double a) nogil
+    double random_power(bitgen_t *bitgen_state, double a) nogil
+    double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) nogil
+    double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil
+    double random_standard_t(bitgen_t *bitgen_state, double df) nogil
+    double random_noncentral_chisquare(bitgen_t *bitgen_state, double df,
+                                       double nonc) nogil
+    double random_noncentral_f(bitgen_t *bitgen_state, double dfnum,
+                               double dfden, double nonc) nogil
+    double random_wald(bitgen_t *bitgen_state, double mean, double scale) nogil
+    double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil
+    double random_triangular(bitgen_t *bitgen_state, double left, double mode,
+                             double right) nogil
+
+    int64_t random_poisson(bitgen_t *bitgen_state, double lam) nogil
+    int64_t random_negative_binomial(bitgen_t *bitgen_state, double n, double p) nogil
+    int64_t random_binomial(bitgen_t *bitgen_state, double p, int64_t n, binomial_t *binomial) nogil
+    int64_t random_logseries(bitgen_t *bitgen_state, double p) nogil
+    int64_t random_geometric_search(bitgen_t *bitgen_state, double p) nogil
+    int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) nogil
+    int64_t random_geometric(bitgen_t *bitgen_state, double p) nogil
+    int64_t random_zipf(bitgen_t *bitgen_state, double a) nogil
+    int64_t random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad,
+                                    int64_t sample) nogil
+
+    uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) nogil
+
+    # Generate random uint64 numbers in closed interval [off, off + rng].
+    uint64_t random_bounded_uint64(bitgen_t *bitgen_state,
+                                   uint64_t off, uint64_t rng,
+                                   uint64_t mask, bint use_masked) nogil
+
+    void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix,
+                            double *pix, np.npy_intp d, binomial_t *binomial) nogil
 
 np.import_array()
 
@@ -193,9 +287,9 @@ cdef class Generator:
         cdef double temp
         key = np.dtype(dtype).name
         if key == 'float64':
-            return double_fill(&random_double_fill, &self._bitgen, size, self.lock, out)
+            return double_fill(&random_standard_uniform_fill, &self._bitgen, size, self.lock, out)
         elif key == 'float32':
-            return float_fill(&random_float, &self._bitgen, size, self.lock, out)
+            return float_fill(&random_standard_uniform_fill_f, &self._bitgen, size, self.lock, out)
         else:
             raise TypeError('Unsupported dtype "%s" for random' % key)
 
@@ -341,9 +435,9 @@ cdef class Generator:
                 return double_fill(&random_standard_exponential_fill, &self._bitgen, size, self.lock, out)
         elif key == 'float32':
             if method == u'zig':
-                return float_fill(&random_standard_exponential_zig_f, &self._bitgen, size, self.lock, out)
+                return float_fill(&random_standard_exponential_zig_fill_f, &self._bitgen, size, self.lock, out)
             else:
-                return float_fill(&random_standard_exponential_f, &self._bitgen, size, self.lock, out)
+                return float_fill(&random_standard_exponential_fill_f, &self._bitgen, size, self.lock, out)
         else:
             raise TypeError('Unsupported dtype "%s" for standard_exponential'
                             % key)
@@ -920,9 +1014,9 @@ cdef class Generator:
         """
         key = np.dtype(dtype).name
         if key == 'float64':
-            return double_fill(&random_gauss_zig_fill, &self._bitgen, size, self.lock, out)
+            return double_fill(&random_standard_normal_fill, &self._bitgen, size, self.lock, out)
         elif key == 'float32':
-            return float_fill(&random_gauss_zig_f, &self._bitgen, size, self.lock, out)
+            return float_fill(&random_standard_normal_fill_f, &self._bitgen, size, self.lock, out)
 
         else:
             raise TypeError('Unsupported dtype "%s" for standard_normal' % key)
@@ -1023,7 +1117,7 @@ cdef class Generator:
                [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]])  # random
 
         """
-        return cont(&random_normal_zig, &self._bitgen, size, self.lock, 2,
+        return cont(&random_normal, &self._bitgen, size, self.lock, 2,
                     loc, '', CONS_NONE,
                     scale, 'scale', CONS_NON_NEGATIVE,
                     0.0, '', CONS_NONE,
@@ -1109,13 +1203,13 @@ cdef class Generator:
         cdef void *func
         key = np.dtype(dtype).name
         if key == 'float64':
-            return cont(&random_standard_gamma_zig, &self._bitgen, size, self.lock, 1,
+            return cont(&random_standard_gamma, &self._bitgen, size, self.lock, 1,
                         shape, 'shape', CONS_NON_NEGATIVE,
                         0.0, '', CONS_NONE,
                         0.0, '', CONS_NONE,
                         out)
         if key == 'float32':
-            return cont_f(&random_standard_gamma_zig_f, &self._bitgen, size, self.lock,
+            return cont_f(&random_standard_gamma_f, &self._bitgen, size, self.lock,
                           shape, 'shape', CONS_NON_NEGATIVE,
                           out)
         else:
@@ -3773,7 +3867,7 @@ cdef class Generator:
             while i < totsize:
                 acc = 0.0
                 for j in range(k):
-                    val_data[i+j] = random_standard_gamma_zig(&self._bitgen,
+                    val_data[i+j] = random_standard_gamma(&self._bitgen,
                                                               alpha_data[j])
                     acc = acc + val_data[i + j]
                 invacc = 1/acc
@@ -4003,19 +4097,18 @@ def default_rng(seed=None):
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence, BitGenerator, Generator}, optional
+    seed : {None, int, array_like[ints], SeedSequence, BitGenerator, Generator}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        pass in a `SeedSequence` instance.
         Additionally, when passed a `BitGenerator`, it will be wrapped by
         `Generator`. If passed a `Generator`, it will be returned unaltered.
 
     Notes
     -----
-    When `seed` is omitted or ``None``, a new `BitGenerator` and `Generator` will
+    When ``seed`` is omitted or ``None``, a new `BitGenerator` and `Generator` will
     be instantiated each time. This function does not manage a default global
     instance.
     """
diff --git a/numpy/random/mt19937.pyx b/numpy/random/_mt19937.pyx
index 7d0f6cd22..e99652b73 100644
--- a/numpy/random/mt19937.pyx
+++ b/numpy/random/_mt19937.pyx
@@ -3,8 +3,8 @@ import operator
 import numpy as np
 cimport numpy as np
 
-from .common cimport *
-from .bit_generator cimport BitGenerator, SeedSequence
+from libc.stdint cimport uint32_t, uint64_t
+from ._bit_generator cimport BitGenerator, SeedSequence
 
 __all__ = ['MT19937']
 
@@ -48,13 +48,12 @@ cdef class MT19937(BitGenerator):
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        pass in a `SeedSequence` instance.
 
     Attributes
     ----------
diff --git a/numpy/random/pcg64.pyx b/numpy/random/_pcg64.pyx
index 585520139..1a5d852a2 100644
--- a/numpy/random/pcg64.pyx
+++ b/numpy/random/_pcg64.pyx
@@ -1,8 +1,9 @@
 import numpy as np
 cimport numpy as np
 
-from .common cimport *
-from .bit_generator cimport BitGenerator
+from libc.stdint cimport uint32_t, uint64_t
+from ._common cimport uint64_to_double, wrap_int
+from ._bit_generator cimport BitGenerator
 
 __all__ = ['PCG64']
 
@@ -43,13 +44,12 @@ cdef class PCG64(BitGenerator):
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        pass in a `SeedSequence` instance.
 
     Notes
     -----
diff --git a/numpy/random/philox.pyx b/numpy/random/_philox.pyx
index 8b7683017..9f136c32f 100644
--- a/numpy/random/philox.pyx
+++ b/numpy/random/_philox.pyx
@@ -6,9 +6,11 @@ except ImportError:
     from dummy_threading import Lock
 
 import numpy as np
+cimport numpy as np
 
-from .common cimport *
-from .bit_generator cimport BitGenerator
+from libc.stdint cimport uint32_t, uint64_t
+from ._common cimport uint64_to_double, int_to_array, wrap_int
+from ._bit_generator cimport BitGenerator
 
 __all__ = ['Philox']
 
@@ -62,21 +64,20 @@ cdef class Philox(BitGenerator):
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        pass in a `SeedSequence` instance.
     counter : {None, int, array_like}, optional
         Counter to use in the Philox state. Can be either
         a Python int (long in 2.x) in [0, 2**256) or a 4-element uint64 array.
         If not provided, the RNG is initialized at 0.
     key : {None, int, array_like}, optional
-        Key to use in the Philox state.  Unlike seed, the value in key is
+        Key to use in the Philox state.  Unlike ``seed``, the value in key is
         directly set. Can be either a Python int in [0, 2**128) or a 2-element
-        uint64 array. `key` and `seed` cannot both be used.
+        uint64 array. ``key`` and ``seed`` cannot both be used.
 
     Attributes
     ----------
@@ -108,10 +109,10 @@ cdef class Philox(BitGenerator):
     randoms produced. The second is a key which determined the sequence
     produced. Using different keys produces independent sequences.
 
-    The input seed is processed by `SeedSequence` to generate the key. The
+    The input ``seed`` is processed by `SeedSequence` to generate the key. The
     counter is set to 0.
 
-    Alternately, one can omit the seed parameter and set the ``key`` and
+    Alternately, one can omit the ``seed`` parameter and set the ``key`` and
     ``counter`` directly.
 
     **Parallel Features**
@@ -146,7 +147,7 @@ cdef class Philox(BitGenerator):
 
     **Compatibility Guarantee**
 
-    ``Philox`` makes a guarantee that a fixed seed will always produce
+    ``Philox`` makes a guarantee that a fixed ``seed`` will always produce
     the same random integer stream.
 
     Examples
diff --git a/numpy/random/_pickle.py b/numpy/random/_pickle.py
index 3b58f21e8..29ff69644 100644
--- a/numpy/random/_pickle.py
+++ b/numpy/random/_pickle.py
@@ -1,10 +1,10 @@
 from .mtrand import RandomState
-from .philox import Philox
-from .pcg64 import PCG64
-from .sfc64 import SFC64
+from ._philox import Philox
+from ._pcg64 import PCG64
+from ._sfc64 import SFC64
 
-from .generator import Generator
-from .mt19937 import MT19937
+from ._generator import Generator
+from ._mt19937 import MT19937
 
 BitGenerators = {'MT19937': MT19937,
                  'PCG64': PCG64,
diff --git a/numpy/random/sfc64.pyx b/numpy/random/_sfc64.pyx
index a881096e9..1633669d5 100644
--- a/numpy/random/sfc64.pyx
+++ b/numpy/random/_sfc64.pyx
@@ -1,8 +1,9 @@
 import numpy as np
 cimport numpy as np
 
-from .common cimport *
-from .bit_generator cimport BitGenerator
+from libc.stdint cimport uint32_t, uint64_t
+from ._common cimport uint64_to_double
+from ._bit_generator cimport BitGenerator
 
 __all__ = ['SFC64']
 
@@ -38,13 +39,12 @@ cdef class SFC64(BitGenerator):
 
     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        pass in a `SeedSequence` instance.
 
     Notes
     -----
diff --git a/numpy/random/distributions.pxd b/numpy/random/distributions.pxd
deleted file mode 100644
index 75edaee9d..000000000
--- a/numpy/random/distributions.pxd
+++ /dev/null
@@ -1,140 +0,0 @@
-#cython: language_level=3
-
-from .common cimport (uint8_t, uint16_t, uint32_t, uint64_t,
-                          int32_t, int64_t, bitgen_t)
-import numpy as np
-cimport numpy as np
-
-cdef extern from "src/distributions/distributions.h":
-
-    struct s_binomial_t:
-        int has_binomial
-        double psave
-        int64_t nsave
-        double r
-        double q
-        double fm
-        int64_t m
-        double p1
-        double xm
-        double xl
-        double xr
-        double c
-        double laml
-        double lamr
-        double p2
-        double p3
-        double p4
-
-    ctypedef s_binomial_t binomial_t
-
-    double random_double(bitgen_t *bitgen_state) nogil
-    void random_double_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil
-    double random_standard_exponential(bitgen_t *bitgen_state) nogil
-    void random_standard_exponential_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil
-    double random_standard_exponential_zig(bitgen_t *bitgen_state) nogil
-    void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil
-    double random_gauss_zig(bitgen_t* bitgen_state) nogil
-    void random_gauss_zig_fill(bitgen_t *bitgen_state, np.npy_intp count, double *out) nogil
-    double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape) nogil
-
-    float random_float(bitgen_t *bitgen_state) nogil
-    float random_standard_exponential_f(bitgen_t *bitgen_state) nogil
-    float random_standard_exponential_zig_f(bitgen_t *bitgen_state) nogil
-    float random_gauss_zig_f(bitgen_t* bitgen_state) nogil
-    float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil
-    float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape) nogil
-
-    int64_t random_positive_int64(bitgen_t *bitgen_state) nogil
-    int32_t random_positive_int32(bitgen_t *bitgen_state) nogil
-    int64_t random_positive_int(bitgen_t *bitgen_state) nogil
-    uint64_t random_uint(bitgen_t *bitgen_state) nogil
-
-    double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale) nogil
-
-    double random_gamma(bitgen_t *bitgen_state, double shape, double scale) nogil
-    float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale) nogil
-
-    double random_exponential(bitgen_t *bitgen_state, double scale) nogil
-    double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil
-    double random_beta(bitgen_t *bitgen_state, double a, double b) nogil
-    double random_chisquare(bitgen_t *bitgen_state, double df) nogil
-    double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) nogil
-    double random_standard_cauchy(bitgen_t *bitgen_state) nogil
-    double random_pareto(bitgen_t *bitgen_state, double a) nogil
-    double random_weibull(bitgen_t *bitgen_state, double a) nogil
-    double random_power(bitgen_t *bitgen_state, double a) nogil
-    double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil
-    double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil
-    double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil
-    double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) nogil
-    double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil
-    double random_standard_t(bitgen_t *bitgen_state, double df) nogil
-    double random_noncentral_chisquare(bitgen_t *bitgen_state, double df,
-                                       double nonc) nogil
-    double random_noncentral_f(bitgen_t *bitgen_state, double dfnum,
-                               double dfden, double nonc) nogil
-    double random_wald(bitgen_t *bitgen_state, double mean, double scale) nogil
-    double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil
-    double random_triangular(bitgen_t *bitgen_state, double left, double mode,
-                             double right) nogil
-
-    int64_t random_poisson(bitgen_t *bitgen_state, double lam) nogil
-    int64_t random_negative_binomial(bitgen_t *bitgen_state, double n, double p) nogil
-    int64_t random_binomial(bitgen_t *bitgen_state, double p, int64_t n, binomial_t *binomial) nogil
-    int64_t random_logseries(bitgen_t *bitgen_state, double p) nogil
-    int64_t random_geometric_search(bitgen_t *bitgen_state, double p) nogil
-    int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) nogil
-    int64_t random_geometric(bitgen_t *bitgen_state, double p) nogil
-    int64_t random_zipf(bitgen_t *bitgen_state, double a) nogil
-    int64_t random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad,
-                                    int64_t sample) nogil
-
-    uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) nogil
-
-    # Generate random uint64 numbers in closed interval [off, off + rng].
-    uint64_t random_bounded_uint64(bitgen_t *bitgen_state,
-                                   uint64_t off, uint64_t rng,
-                                   uint64_t mask, bint use_masked) nogil
-
-    # Generate random uint32 numbers in closed interval [off, off + rng].
-    uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state,
-                                            uint32_t off, uint32_t rng,
-                                            uint32_t mask, bint use_masked,
-                                            int *bcnt, uint32_t *buf) nogil
-    uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state,
-                                            uint16_t off, uint16_t rng,
-                                            uint16_t mask, bint use_masked,
-                                            int *bcnt, uint32_t *buf) nogil
-    uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state,
-                                          uint8_t off, uint8_t rng,
-                                          uint8_t mask, bint use_masked,
-                                          int *bcnt, uint32_t *buf) nogil
-    np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state,
-                                             np.npy_bool off, np.npy_bool rng,
-                                             np.npy_bool mask, bint use_masked,
-                                             int *bcnt, uint32_t *buf) nogil
-
-    void random_bounded_uint64_fill(bitgen_t *bitgen_state,
-                                    uint64_t off, uint64_t rng, np.npy_intp cnt,
-                                    bint use_masked,
-                                    uint64_t *out) nogil
-    void random_bounded_uint32_fill(bitgen_t *bitgen_state,
-                                    uint32_t off, uint32_t rng, np.npy_intp cnt,
-                                    bint use_masked,
-                                    uint32_t *out) nogil
-    void random_bounded_uint16_fill(bitgen_t *bitgen_state,
-                                    uint16_t off, uint16_t rng, np.npy_intp cnt,
-                                    bint use_masked,
-                                    uint16_t *out) nogil
-    void random_bounded_uint8_fill(bitgen_t *bitgen_state,
-                                   uint8_t off, uint8_t rng, np.npy_intp cnt,
-                                   bint use_masked,
-                                   uint8_t *out) nogil
-    void random_bounded_bool_fill(bitgen_t *bitgen_state,
-                                  np.npy_bool off, np.npy_bool rng, np.npy_intp cnt,
-                                  bint use_masked,
-                                  np.npy_bool *out) nogil
-
-    void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix,
-                            double *pix, np.npy_intp d, binomial_t *binomial) nogil
diff --git a/numpy/random/src/aligned_malloc/aligned_malloc.h b/numpy/random/include/aligned_malloc.h
index ea24f6d23..ea24f6d23 100644
--- a/numpy/random/src/aligned_malloc/aligned_malloc.h
+++ b/numpy/random/include/aligned_malloc.h
diff --git a/numpy/random/src/bitgen.h b/numpy/random/include/bitgen.h
index 0adaaf2ee..83c2858dd 100644
--- a/numpy/random/src/bitgen.h
+++ b/numpy/random/include/bitgen.h
@@ -6,7 +6,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-/* Must match the declaration in numpy/random/common.pxd */
+/* Must match the declaration in numpy/random/<any>.pxd */
 
 typedef struct bitgen {
   void *state;
diff --git a/numpy/random/src/distributions/distributions.h b/numpy/random/include/distributions.h
index 2a6b2a045..fb69c7d2c 100644
--- a/numpy/random/src/distributions/distributions.h
+++ b/numpy/random/include/distributions.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 
 #include "numpy/npy_math.h"
-#include "src/bitgen.h"
+#include "include/bitgen.h"
 
 /*
  * RAND_INT_TYPE is used to share integer generators with RandomState which
@@ -59,28 +59,10 @@ typedef struct s_binomial_t {
   double p4;
 } binomial_t;
 
-/* Inline generators for internal use */
-static NPY_INLINE uint32_t next_uint32(bitgen_t *bitgen_state) {
-  return bitgen_state->next_uint32(bitgen_state->state);
-}
-
-static NPY_INLINE uint64_t next_uint64(bitgen_t *bitgen_state) {
-  return bitgen_state->next_uint64(bitgen_state->state);
-}
-
-static NPY_INLINE float next_float(bitgen_t *bitgen_state) {
-  return (next_uint32(bitgen_state) >> 9) * (1.0f / 8388608.0f);
-}
-
-static NPY_INLINE double next_double(bitgen_t *bitgen_state) {
-  return bitgen_state->next_double(bitgen_state->state);
-}
-
-DECLDIR double loggam(double x);
-
-DECLDIR float random_float(bitgen_t *bitgen_state);
-DECLDIR double random_double(bitgen_t *bitgen_state);
-DECLDIR void random_double_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out);
+DECLDIR float random_standard_uniform_f(bitgen_t *bitgen_state);
+DECLDIR double random_standard_uniform(bitgen_t *bitgen_state);
+DECLDIR void random_standard_uniform_fill(bitgen_t *, npy_intp, double *);
+DECLDIR void random_standard_uniform_fill_f(bitgen_t *, npy_intp, float *);
 
 DECLDIR int64_t random_positive_int64(bitgen_t *bitgen_state);
 DECLDIR int32_t random_positive_int32(bitgen_t *bitgen_state);
@@ -88,37 +70,25 @@ DECLDIR int64_t random_positive_int(bitgen_t *bitgen_state);
 DECLDIR uint64_t random_uint(bitgen_t *bitgen_state);
 
 DECLDIR double random_standard_exponential(bitgen_t *bitgen_state);
-DECLDIR void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt,
-                                              double *out);
 DECLDIR float random_standard_exponential_f(bitgen_t *bitgen_state);
 DECLDIR double random_standard_exponential_zig(bitgen_t *bitgen_state);
-DECLDIR void random_standard_exponential_zig_fill(bitgen_t *bitgen_state,
-                                                  npy_intp cnt, double *out);
 DECLDIR float random_standard_exponential_zig_f(bitgen_t *bitgen_state);
-
-/*
-DECLDIR double random_gauss(bitgen_t *bitgen_state);
-DECLDIR float random_gauss_f(bitgen_t *bitgen_state);
-*/
-DECLDIR double random_gauss_zig(bitgen_t *bitgen_state);
-DECLDIR float random_gauss_zig_f(bitgen_t *bitgen_state);
-DECLDIR void random_gauss_zig_fill(bitgen_t *bitgen_state, npy_intp cnt,
-                                   double *out);
-
-/*
+DECLDIR void random_standard_exponential_fill(bitgen_t *, npy_intp, double *);
+DECLDIR void random_standard_exponential_fill_f(bitgen_t *, npy_intp, float *);
+DECLDIR void random_standard_exponential_zig_fill(bitgen_t *, npy_intp, double *);
+DECLDIR void random_standard_exponential_zig_fill_f(bitgen_t *, npy_intp, float *);
+
+DECLDIR double random_standard_normal(bitgen_t *bitgen_state);
+DECLDIR float random_standard_normal_f(bitgen_t *bitgen_state);
+DECLDIR void random_standard_normal_fill(bitgen_t *, npy_intp, double *);
+DECLDIR void random_standard_normal_fill_f(bitgen_t *, npy_intp, float *);
 DECLDIR double random_standard_gamma(bitgen_t *bitgen_state, double shape);
 DECLDIR float random_standard_gamma_f(bitgen_t *bitgen_state, float shape);
-*/
-DECLDIR double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape);
-DECLDIR float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape);
 
-/*
 DECLDIR double random_normal(bitgen_t *bitgen_state, double loc, double scale);
-*/
-DECLDIR double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale);
 
 DECLDIR double random_gamma(bitgen_t *bitgen_state, double shape, double scale);
-DECLDIR float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale);
+DECLDIR float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale);
 
 DECLDIR double random_exponential(bitgen_t *bitgen_state, double scale);
 DECLDIR double random_uniform(bitgen_t *bitgen_state, double lower, double range);
@@ -146,27 +116,16 @@ DECLDIR double random_triangular(bitgen_t *bitgen_state, double left, double mod
 
 DECLDIR RAND_INT_TYPE random_poisson(bitgen_t *bitgen_state, double lam);
 DECLDIR RAND_INT_TYPE random_negative_binomial(bitgen_t *bitgen_state, double n,
-                                         double p);
-
-DECLDIR RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state,
-                                           RAND_INT_TYPE n,
-                                           double p,
-                                           binomial_t *binomial);
-DECLDIR RAND_INT_TYPE random_binomial_inversion(bitgen_t *bitgen_state,
-                                                RAND_INT_TYPE n,
-                                                double p,
-                                                binomial_t *binomial);
+                                 double p);
+
 DECLDIR int64_t random_binomial(bitgen_t *bitgen_state, double p,
                                 int64_t n, binomial_t *binomial);
 
 DECLDIR RAND_INT_TYPE random_logseries(bitgen_t *bitgen_state, double p);
-DECLDIR RAND_INT_TYPE random_geometric_search(bitgen_t *bitgen_state, double p);
-DECLDIR RAND_INT_TYPE random_geometric_inversion(bitgen_t *bitgen_state, double p);
 DECLDIR RAND_INT_TYPE random_geometric(bitgen_t *bitgen_state, double p);
 DECLDIR RAND_INT_TYPE random_zipf(bitgen_t *bitgen_state, double a);
 DECLDIR int64_t random_hypergeometric(bitgen_t *bitgen_state,
                                       int64_t good, int64_t bad, int64_t sample);
-
 DECLDIR uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max);
 
 /* Generate random uint64 numbers in closed interval [off, off + rng]. */
@@ -211,4 +170,19 @@ DECLDIR void random_bounded_bool_fill(bitgen_t *bitgen_state, npy_bool off,
 DECLDIR void random_multinomial(bitgen_t *bitgen_state, RAND_INT_TYPE n, RAND_INT_TYPE *mnix,
                                 double *pix, npy_intp d, binomial_t *binomial);
 
+/* Common to legacy-distributions.c and distributions.c but not exported */
+
+RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state,
+                                   RAND_INT_TYPE n,
+                                   double p,
+                                   binomial_t *binomial);
+RAND_INT_TYPE random_binomial_inversion(bitgen_t *bitgen_state,
+                                        RAND_INT_TYPE n,
+                                        double p,
+                                        binomial_t *binomial);
+double random_loggam(double x);
+static NPY_INLINE double next_double(bitgen_t *bitgen_state) {
+    return bitgen_state->next_double(bitgen_state->state);
+}
+
 #endif
diff --git a/numpy/random/src/legacy/legacy-distributions.h b/numpy/random/include/legacy-distributions.h
index 4bc15d58e..6a0fc7dc4 100644
--- a/numpy/random/src/legacy/legacy-distributions.h
+++ b/numpy/random/include/legacy-distributions.h
@@ -2,7 +2,7 @@
 #define _RANDOMDGEN__DISTRIBUTIONS_LEGACY_H_
 
 
-#include "../distributions/distributions.h"
+#include "distributions.h"
 
 typedef struct aug_bitgen {
   bitgen_t *bit_generator;
diff --git a/numpy/random/legacy_distributions.pxd b/numpy/random/legacy_distributions.pxd
deleted file mode 100644
index c681388db..000000000
--- a/numpy/random/legacy_distributions.pxd
+++ /dev/null
@@ -1,50 +0,0 @@
-#cython: language_level=3
-
-from libc.stdint cimport int64_t
-
-import numpy as np
-cimport numpy as np
-
-from .distributions cimport bitgen_t, binomial_t
-
-cdef extern from "legacy-distributions.h":
-
-    struct aug_bitgen:
-        bitgen_t *bit_generator
-        int has_gauss
-        double gauss
-
-    ctypedef aug_bitgen aug_bitgen_t
-
-    double legacy_gauss(aug_bitgen_t *aug_state) nogil
-    double legacy_pareto(aug_bitgen_t *aug_state, double a) nogil
-    double legacy_weibull(aug_bitgen_t *aug_state, double a) nogil
-    double legacy_standard_gamma(aug_bitgen_t *aug_state, double shape) nogil
-    double legacy_normal(aug_bitgen_t *aug_state, double loc, double scale) nogil
-    double legacy_standard_t(aug_bitgen_t *aug_state, double df) nogil
-
-    double legacy_standard_exponential(aug_bitgen_t *aug_state) nogil
-    double legacy_power(aug_bitgen_t *aug_state, double a) nogil
-    double legacy_gamma(aug_bitgen_t *aug_state, double shape, double scale) nogil
-    double legacy_power(aug_bitgen_t *aug_state, double a) nogil
-    double legacy_chisquare(aug_bitgen_t *aug_state, double df) nogil
-    double legacy_noncentral_chisquare(aug_bitgen_t *aug_state, double df,
-                                    double nonc) nogil
-    double legacy_noncentral_f(aug_bitgen_t *aug_state, double dfnum, double dfden,
-                            double nonc) nogil
-    double legacy_wald(aug_bitgen_t *aug_state, double mean, double scale) nogil
-    double legacy_lognormal(aug_bitgen_t *aug_state, double mean, double sigma) nogil
-    int64_t legacy_random_binomial(bitgen_t *bitgen_state, double p,
-                                   int64_t n, binomial_t *binomial) nogil
-    int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, double p) nogil
-    int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, int64_t sample) nogil
-    int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) nogil
-    int64_t legacy_random_poisson(bitgen_t *bitgen_state, double lam) nogil
-    int64_t legacy_random_zipf(bitgen_t *bitgen_state, double a) nogil
-    int64_t legacy_random_geometric(bitgen_t *bitgen_state, double p) nogil
-    void legacy_random_multinomial(bitgen_t *bitgen_state, long n, long *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil
-    double legacy_standard_cauchy(aug_bitgen_t *state) nogil
-    double legacy_beta(aug_bitgen_t *aug_state, double a, double b) nogil
-    double legacy_f(aug_bitgen_t *aug_state, double dfnum, double dfden) nogil
-    double legacy_exponential(aug_bitgen_t *aug_state, double scale) nogil
-    double legacy_power(aug_bitgen_t *state, double a) nogil
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index c469a4645..683a771cc 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -5,19 +5,100 @@ import warnings
 
 import numpy as np
 
-from .bounded_integers import _integers_types
-from .mt19937 import MT19937 as _MT19937
 from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer
 from cpython cimport (Py_INCREF, PyFloat_AsDouble)
-from libc cimport string
-
 cimport cython
 cimport numpy as np
 
-from .bounded_integers cimport *
-from .common cimport *
-from .distributions cimport *
-from .legacy_distributions cimport *
+from libc cimport string
+from libc.stdint cimport int64_t, uint64_t
+from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
+         _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16,
+         _rand_uint8,)
+from ._bounded_integers import _integers_types
+from ._mt19937 import MT19937 as _MT19937
+from ._bit_generator cimport bitgen_t
+from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
+            CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1, CONS_GTE_1,
+            CONS_GT_1, LEGACY_CONS_POISSON,
+            double_fill, cont, kahan_sum, cont_broadcast_3,
+            check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
+        )
+
+cdef extern from "include/distributions.h":
+    struct s_binomial_t:
+        int has_binomial
+        double psave
+        int64_t nsave
+        double r
+        double q
+        double fm
+        int64_t m
+        double p1
+        double xm
+        double xl
+        double xr
+        double c
+        double laml
+        double lamr
+        double p2
+        double p3
+        double p4
+
+    ctypedef s_binomial_t binomial_t
+
+    void random_standard_uniform_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil
+    int64_t random_positive_int(bitgen_t *bitgen_state) nogil
+    double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil
+    double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil
+    double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil
+    double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil
+    double random_triangular(bitgen_t *bitgen_state, double left, double mode,
+                                 double right) nogil
+    uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) nogil
+
+cdef extern from "include/legacy-distributions.h":
+    struct aug_bitgen:
+        bitgen_t *bit_generator
+        int has_gauss
+        double gauss
+
+    ctypedef aug_bitgen aug_bitgen_t
+
+    double legacy_gauss(aug_bitgen_t *aug_state) nogil
+    double legacy_pareto(aug_bitgen_t *aug_state, double a) nogil
+    double legacy_weibull(aug_bitgen_t *aug_state, double a) nogil
+    double legacy_standard_gamma(aug_bitgen_t *aug_state, double shape) nogil
+    double legacy_normal(aug_bitgen_t *aug_state, double loc, double scale) nogil
+    double legacy_standard_t(aug_bitgen_t *aug_state, double df) nogil
+
+    double legacy_standard_exponential(aug_bitgen_t *aug_state) nogil
+    double legacy_power(aug_bitgen_t *aug_state, double a) nogil
+    double legacy_gamma(aug_bitgen_t *aug_state, double shape, double scale) nogil
+    double legacy_power(aug_bitgen_t *aug_state, double a) nogil
+    double legacy_chisquare(aug_bitgen_t *aug_state, double df) nogil
+    double legacy_noncentral_chisquare(aug_bitgen_t *aug_state, double df,
+                                    double nonc) nogil
+    double legacy_noncentral_f(aug_bitgen_t *aug_state, double dfnum, double dfden,
+                            double nonc) nogil
+    double legacy_wald(aug_bitgen_t *aug_state, double mean, double scale) nogil
+    double legacy_lognormal(aug_bitgen_t *aug_state, double mean, double sigma) nogil
+    int64_t legacy_random_binomial(bitgen_t *bitgen_state, double p,
+                                   int64_t n, binomial_t *binomial) nogil
+    int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, double p) nogil
+    int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, int64_t sample) nogil
+    int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) nogil
+    int64_t legacy_random_poisson(bitgen_t *bitgen_state, double lam) nogil
+    int64_t legacy_random_zipf(bitgen_t *bitgen_state, double a) nogil
+    int64_t legacy_random_geometric(bitgen_t *bitgen_state, double p) nogil
+    void legacy_random_multinomial(bitgen_t *bitgen_state, long n, long *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil
+    double legacy_standard_cauchy(aug_bitgen_t *state) nogil
+    double legacy_beta(aug_bitgen_t *aug_state, double a, double b) nogil
+    double legacy_f(aug_bitgen_t *aug_state, double dfnum, double dfden) nogil
+    double legacy_exponential(aug_bitgen_t *aug_state, double scale) nogil
+    double legacy_power(aug_bitgen_t *state, double a) nogil
 
 np.import_array()
 
@@ -84,7 +165,7 @@ cdef class RandomState:
     --------
     Generator
     MT19937
-    :ref:`bit_generator`
+    numpy.random.BitGenerator
 
     """
     cdef public object _bit_generator
@@ -329,7 +410,7 @@ cdef class RandomState:
 
         """
         cdef double temp
-        return double_fill(&random_double_fill, &self._bitgen, size, self.lock, None)
+        return double_fill(&random_standard_uniform_fill, &self._bitgen, size, self.lock, None)
 
     def random(self, size=None):
         """
@@ -567,7 +648,7 @@ cdef class RandomState:
 
         See Also
         --------
-        random.random_integers : similar to `randint`, only for the closed
+        random_integers : similar to `randint`, only for the closed
             interval [`low`, `high`], and 1 is the lowest value if `high` is
             omitted.
 
@@ -985,7 +1066,7 @@ cdef class RandomState:
 
         .. note::
             This is a convenience function for users porting code from Matlab,
-            and wraps `numpy.random.random_sample`. That function takes a
+            and wraps `random_sample`. That function takes a
             tuple to specify the size of the output, which is consistent with
             other NumPy functions like `numpy.zeros` and `numpy.ones`.
 
@@ -1029,7 +1110,7 @@ cdef class RandomState:
 
         .. note::
             This is a convenience function for users porting code from Matlab,
-            and wraps `numpy.random.standard_normal`. That function takes a
+            and wraps `standard_normal`. That function takes a
             tuple to specify the size of the output, which is consistent with
             other NumPy functions like `numpy.zeros` and `numpy.ones`.
 
@@ -1289,8 +1370,8 @@ cdef class RandomState:
         The function has its peak at the mean, and its "spread" increases with
         the standard deviation (the function reaches 0.607 times its maximum at
         :math:`x + \\sigma` and :math:`x - \\sigma` [2]_).  This implies that
-        `numpy.random.normal` is more likely to return samples lying close to
-        the mean, rather than those far away.
+        `normal` is more likely to return samples lying close to the mean,
+        rather than those far away.
 
         References
         ----------
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index ce7f0565f..ddb16339b 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -61,32 +61,32 @@ def configuration(parent_package='', top_path=None):
 
     for gen in ['mt19937']:
         # gen.pyx, src/gen/gen.c, src/gen/gen-jump.c
-        config.add_extension(gen,
-                             sources=['{0}.c'.format(gen),
+        config.add_extension('_{0}'.format(gen),
+                             sources=['_{0}.c'.format(gen),
                                       'src/{0}/{0}.c'.format(gen),
                                       'src/{0}/{0}-jump.c'.format(gen)],
                              include_dirs=['.', 'src', join('src', gen)],
                              libraries=EXTRA_LIBRARIES,
                              extra_compile_args=EXTRA_COMPILE_ARGS,
                              extra_link_args=EXTRA_LINK_ARGS,
-                             depends=['%s.pyx' % gen],
+                             depends=['_%s.pyx' % gen],
                              define_macros=defs,
                              )
     for gen in ['philox', 'pcg64', 'sfc64']:
-        # gen.pyx, src/gen/gen.c
+        # _gen.pyx, src/gen/gen.c; also depends on the shared _bit_generator
         _defs = defs + PCG64_DEFS if gen == 'pcg64' else defs
-        config.add_extension(gen,
-                             sources=['{0}.c'.format(gen),
+        config.add_extension('_{0}'.format(gen),
+                             sources=['_{0}.c'.format(gen),
                                       'src/{0}/{0}.c'.format(gen)],
                              include_dirs=['.', 'src', join('src', gen)],
                              libraries=EXTRA_LIBRARIES,
                              extra_compile_args=EXTRA_COMPILE_ARGS,
                              extra_link_args=EXTRA_LINK_ARGS,
-                             depends=['%s.pyx' % gen, 'bit_generator.pyx',
-                                      'bit_generator.pxd'],
+                             depends=['_%s.pyx' % gen, '_bit_generator.pyx',
+                                      '_bit_generator.pxd'],
                              define_macros=_defs,
                              )
-    for gen in ['common', 'bit_generator']:
+    for gen in ['_common', '_bit_generator']:
         # gen.pyx
         config.add_extension(gen,
                              sources=['{0}.c'.format(gen)],
@@ -102,7 +102,7 @@ def configuration(parent_package='', top_path=None):
         'src/distributions/distributions.c',
         'src/distributions/random_hypergeometric.c',
     ]
-    for gen in ['generator', 'bounded_integers']:
+    for gen in ['_generator', '_bounded_integers']:
         # gen.pyx, src/distributions/distributions.c
         config.add_extension(gen,
                              sources=['{0}.c'.format(gen)] + other_srcs,
diff --git a/numpy/random/src/aligned_malloc/aligned_malloc.c b/numpy/random/src/aligned_malloc/aligned_malloc.c
deleted file mode 100644
index 6e8192cfb..000000000
--- a/numpy/random/src/aligned_malloc/aligned_malloc.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "aligned_malloc.h"
-
-static NPY_INLINE void *PyArray_realloc_aligned(void *p, size_t n);
-
-static NPY_INLINE void *PyArray_malloc_aligned(size_t n);
-
-static NPY_INLINE void *PyArray_calloc_aligned(size_t n, size_t s);
-
-static NPY_INLINE void PyArray_free_aligned(void *p);
-\ No newline at end of file
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index 1244ffe65..b382ead0b 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -1,4 +1,4 @@
-#include "distributions.h"
+#include "include/distributions.h"
 #include "ziggurat_constants.h"
 #include "logfactorial.h"
 
@@ -6,90 +6,52 @@
 #include <intrin.h>
 #endif
 
-/* Random generators for external use */
-float random_float(bitgen_t *bitgen_state) { return next_float(bitgen_state); }
-
-double random_double(bitgen_t *bitgen_state) {
-  return next_double(bitgen_state);
+/* Inline generators for internal use */
+static NPY_INLINE uint32_t next_uint32(bitgen_t *bitgen_state) {
+  return bitgen_state->next_uint32(bitgen_state->state);
 }
-
-static NPY_INLINE double next_standard_exponential(bitgen_t *bitgen_state) {
-  return -log(1.0 - next_double(bitgen_state));
+static NPY_INLINE uint64_t next_uint64(bitgen_t *bitgen_state) {
+  return bitgen_state->next_uint64(bitgen_state->state);
 }
 
-double random_standard_exponential(bitgen_t *bitgen_state) {
-  return next_standard_exponential(bitgen_state);
+static NPY_INLINE float next_float(bitgen_t *bitgen_state) {
+  return (next_uint32(bitgen_state) >> 9) * (1.0f / 8388608.0f);
 }
 
-void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt,
-                                      double *out) {
-  npy_intp i;
-  for (i = 0; i < cnt; i++) {
-    out[i] = next_standard_exponential(bitgen_state);
-  }
+/* Random generators for external use */
+float random_standard_uniform_f(bitgen_t *bitgen_state) {
+    return next_float(bitgen_state); /* 23 random bits scaled into [0, 1) */
 }
 
-float random_standard_exponential_f(bitgen_t *bitgen_state) {
-  return -logf(1.0f - next_float(bitgen_state));
+double random_standard_uniform(bitgen_t *bitgen_state) {
+    return next_double(bitgen_state);
 }
 
-void random_double_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) {
+void random_standard_uniform_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) {
   npy_intp i;
   for (i = 0; i < cnt; i++) {
     out[i] = next_double(bitgen_state);
   }
 }
-#if 0
-double random_gauss(bitgen_t *bitgen_state) {
-  if (bitgen_state->has_gauss) {
-    const double temp = bitgen_state->gauss;
-    bitgen_state->has_gauss = false;
-    bitgen_state->gauss = 0.0;
-    return temp;
-  } else {
-    double f, x1, x2, r2;
-
-    do {
-      x1 = 2.0 * next_double(bitgen_state) - 1.0;
-      x2 = 2.0 * next_double(bitgen_state) - 1.0;
-      r2 = x1 * x1 + x2 * x2;
-    } while (r2 >= 1.0 || r2 == 0.0);
 
-    /* Polar method, a more efficient version of the Box-Muller approach. */
-    f = sqrt(-2.0 * log(r2) / r2);
-    /* Keep for next call */
-    bitgen_state->gauss = f * x1;
-    bitgen_state->has_gauss = true;
-    return f * x2;
+void random_standard_uniform_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) {
+  npy_intp i;
+  for (i = 0; i < cnt; i++) {
+    out[i] = next_float(bitgen_state);
   }
 }
 
-float random_gauss_f(bitgen_t *bitgen_state) {
-  if (bitgen_state->has_gauss_f) {
-    const float temp = bitgen_state->gauss_f;
-    bitgen_state->has_gauss_f = false;
-    bitgen_state->gauss_f = 0.0f;
-    return temp;
-  } else {
-    float f, x1, x2, r2;
-
-    do {
-      x1 = 2.0f * next_float(bitgen_state) - 1.0f;
-      x2 = 2.0f * next_float(bitgen_state) - 1.0f;
-      r2 = x1 * x1 + x2 * x2;
-    } while (r2 >= 1.0 || r2 == 0.0);
+double random_standard_exponential(bitgen_t *bitgen_state) {
+    return -log(1.0 - next_double(bitgen_state));
+}
 
-    /* Polar method, a more efficient version of the Box-Muller approach. */
-    f = sqrtf(-2.0f * logf(r2) / r2);
-    /* Keep for next call */
-    bitgen_state->gauss_f = f * x1;
-    bitgen_state->has_gauss_f = true;
-    return f * x2;
+void random_standard_exponential_fill(bitgen_t * bitgen_state, npy_intp cnt, double * out)
+{
+  npy_intp i;
+  for (i = 0; i < cnt; i++) {
+    out[i] = random_standard_exponential(bitgen_state);
   }
 }
-#endif
-
-static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state);
 
 static double standard_exponential_zig_unlikely(bitgen_t *bitgen_state,
                                                 uint8_t idx, double x) {
@@ -101,11 +63,11 @@ static double standard_exponential_zig_unlikely(bitgen_t *bitgen_state,
              exp(-x)) {
     return x;
   } else {
-    return standard_exponential_zig(bitgen_state);
+    return random_standard_exponential_zig(bitgen_state);
   }
 }
 
-static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state) {
+double random_standard_exponential_zig(bitgen_t *bitgen_state) {
   uint64_t ri;
   uint8_t idx;
   double x;
@@ -120,20 +82,26 @@ static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state) {
   return standard_exponential_zig_unlikely(bitgen_state, idx, x);
 }
 
-double random_standard_exponential_zig(bitgen_t *bitgen_state) {
-  return standard_exponential_zig(bitgen_state);
+void random_standard_exponential_zig_fill(bitgen_t * bitgen_state, npy_intp cnt, double * out)
+{
+  npy_intp i;
+  for (i = 0; i < cnt; i++) {
+    out[i] = random_standard_exponential_zig(bitgen_state);
+  }
+}
+
+float random_standard_exponential_f(bitgen_t *bitgen_state) {
+  return -logf(1.0f - next_float(bitgen_state));
 }
 
-void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, npy_intp cnt,
-                                          double *out) {
+void random_standard_exponential_fill_f(bitgen_t * bitgen_state, npy_intp cnt, float * out)
+{
   npy_intp i;
   for (i = 0; i < cnt; i++) {
-    out[i] = standard_exponential_zig(bitgen_state);
+    out[i] = random_standard_exponential_f(bitgen_state);
   }
 }
 
-static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state);
-
 static float standard_exponential_zig_unlikely_f(bitgen_t *bitgen_state,
                                                  uint8_t idx, float x) {
   if (idx == 0) {
@@ -144,11 +112,11 @@ static float standard_exponential_zig_unlikely_f(bitgen_t *bitgen_state,
              expf(-x)) {
     return x;
   } else {
-    return standard_exponential_zig_f(bitgen_state);
+    return random_standard_exponential_zig_f(bitgen_state);
   }
 }
 
-static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state) {
+float random_standard_exponential_zig_f(bitgen_t *bitgen_state) {
   uint32_t ri;
   uint8_t idx;
   float x;
@@ -163,11 +131,15 @@ static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state) {
   return standard_exponential_zig_unlikely_f(bitgen_state, idx, x);
 }
 
-float random_standard_exponential_zig_f(bitgen_t *bitgen_state) {
-  return standard_exponential_zig_f(bitgen_state);
+void random_standard_exponential_zig_fill_f(bitgen_t * bitgen_state, npy_intp cnt, float * out)
+{
+  npy_intp i;
+  for (i = 0; i < cnt; i++) {
+    out[i] = random_standard_exponential_zig_f(bitgen_state);
+  }
 }
 
-static NPY_INLINE double next_gauss_zig(bitgen_t *bitgen_state) {
+double random_standard_normal(bitgen_t *bitgen_state) {
   uint64_t r;
   int sign;
   uint64_t rabs;
@@ -202,18 +174,14 @@ static NPY_INLINE double next_gauss_zig(bitgen_t *bitgen_state) {
   }
 }
 
-double random_gauss_zig(bitgen_t *bitgen_state) {
-  return next_gauss_zig(bitgen_state);
-}
-
-void random_gauss_zig_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) {
+void random_standard_normal_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) {
   npy_intp i;
   for (i = 0; i < cnt; i++) {
-    out[i] = next_gauss_zig(bitgen_state);
+    out[i] = random_standard_normal(bitgen_state);
   }
 }
 
-float random_gauss_zig_f(bitgen_t *bitgen_state) {
+float random_standard_normal_f(bitgen_t *bitgen_state) {
   uint32_t r;
   int sign;
   uint32_t rabs;
@@ -247,101 +215,14 @@ float random_gauss_zig_f(bitgen_t *bitgen_state) {
   }
 }
 
-/*
-static NPY_INLINE double standard_gamma(bitgen_t *bitgen_state, double shape) {
-  double b, c;
-  double U, V, X, Y;
-
-  if (shape == 1.0) {
-    return random_standard_exponential(bitgen_state);
-  } else if (shape < 1.0) {
-    for (;;) {
-      U = next_double(bitgen_state);
-      V = random_standard_exponential(bitgen_state);
-      if (U <= 1.0 - shape) {
-        X = pow(U, 1. / shape);
-        if (X <= V) {
-          return X;
-        }
-      } else {
-        Y = -log((1 - U) / shape);
-        X = pow(1.0 - shape + shape * Y, 1. / shape);
-        if (X <= (V + Y)) {
-          return X;
-        }
-      }
-    }
-  } else {
-    b = shape - 1. / 3.;
-    c = 1. / sqrt(9 * b);
-    for (;;) {
-      do {
-        X = random_gauss(bitgen_state);
-        V = 1.0 + c * X;
-      } while (V <= 0.0);
-
-      V = V * V * V;
-      U = next_double(bitgen_state);
-      if (U < 1.0 - 0.0331 * (X * X) * (X * X))
-        return (b * V);
-      if (log(U) < 0.5 * X * X + b * (1. - V + log(V)))
-        return (b * V);
-    }
-  }
-}
-
-static NPY_INLINE float standard_gamma_float(bitgen_t *bitgen_state, float
-shape) { float b, c; float U, V, X, Y;
-
-  if (shape == 1.0f) {
-    return random_standard_exponential_f(bitgen_state);
-  } else if (shape < 1.0f) {
-    for (;;) {
-      U = next_float(bitgen_state);
-      V = random_standard_exponential_f(bitgen_state);
-      if (U <= 1.0f - shape) {
-        X = powf(U, 1.0f / shape);
-        if (X <= V) {
-          return X;
-        }
-      } else {
-        Y = -logf((1.0f - U) / shape);
-        X = powf(1.0f - shape + shape * Y, 1.0f / shape);
-        if (X <= (V + Y)) {
-          return X;
-        }
-      }
-    }
-  } else {
-    b = shape - 1.0f / 3.0f;
-    c = 1.0f / sqrtf(9.0f * b);
-    for (;;) {
-      do {
-        X = random_gauss_f(bitgen_state);
-        V = 1.0f + c * X;
-      } while (V <= 0.0f);
-
-      V = V * V * V;
-      U = next_float(bitgen_state);
-      if (U < 1.0f - 0.0331f * (X * X) * (X * X))
-        return (b * V);
-      if (logf(U) < 0.5f * X * X + b * (1.0f - V + logf(V)))
-        return (b * V);
-    }
+void random_standard_normal_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) {
+  npy_intp i;
+  for (i = 0; i < cnt; i++) {
+    out[i] = random_standard_normal_f(bitgen_state);
   }
 }
 
-
-double random_standard_gamma(bitgen_t *bitgen_state, double shape) {
-  return standard_gamma(bitgen_state, shape);
-}
-
-float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) {
-  return standard_gamma_float(bitgen_state, shape);
-}
-*/
-
-static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state,
+double random_standard_gamma(bitgen_t *bitgen_state,
                                             double shape) {
   double b, c;
   double U, V, X, Y;
@@ -372,7 +253,7 @@ static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state,
     c = 1. / sqrt(9 * b);
     for (;;) {
       do {
-        X = random_gauss_zig(bitgen_state);
+        X = random_standard_normal(bitgen_state);
         V = 1.0 + c * X;
       } while (V <= 0.0);
 
@@ -387,7 +268,7 @@ static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state,
   }
 }
 
-static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state,
+float random_standard_gamma_f(bitgen_t *bitgen_state,
                                              float shape) {
   float b, c;
   float U, V, X, Y;
@@ -418,7 +299,7 @@ static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state,
     c = 1.0f / sqrtf(9.0f * b);
     for (;;) {
       do {
-        X = random_gauss_zig_f(bitgen_state);
+        X = random_standard_normal_f(bitgen_state);
         V = 1.0f + c * X;
       } while (V <= 0.0f);
 
@@ -433,14 +314,6 @@ static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state,
   }
 }
 
-double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape) {
-  return standard_gamma_zig(bitgen_state, shape);
-}
-
-float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape) {
-  return standard_gamma_zig_f(bitgen_state, shape);
-}
-
 int64_t random_positive_int64(bitgen_t *bitgen_state) {
   return next_uint64(bitgen_state) >> 1;
 }
@@ -470,10 +343,10 @@ uint64_t random_uint(bitgen_t *bitgen_state) {
  * algorithm comes from SPECFUN by Shanjie Zhang and Jianming Jin and their
  * book "Computation of Special Functions", 1996, John Wiley & Sons, Inc.
  *
- * If loggam(k+1) is being used to compute log(k!) for an integer k, consider
+ * If random_loggam(k+1) is being used to compute log(k!) for an integer k, consider
  * using logfactorial(k) instead.
  */
-double loggam(double x) {
+double random_loggam(double x) {
   double x0, x2, xp, gl, gl0;
   RAND_INT_TYPE k, n;
 
@@ -513,12 +386,12 @@ double random_normal(bitgen_t *bitgen_state, double loc, double scale) {
 }
 */
 
-double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale) {
-  return loc + scale * random_gauss_zig(bitgen_state);
+double random_normal(bitgen_t *bitgen_state, double loc, double scale) {
+  return loc + scale * random_standard_normal(bitgen_state);
 }
 
 double random_exponential(bitgen_t *bitgen_state, double scale) {
-  return scale * standard_exponential_zig(bitgen_state);
+  return scale * random_standard_exponential_zig(bitgen_state);
 }
 
 double random_uniform(bitgen_t *bitgen_state, double lower, double range) {
@@ -526,11 +399,11 @@ double random_uniform(bitgen_t *bitgen_state, double lower, double range) {
 }
 
 double random_gamma(bitgen_t *bitgen_state, double shape, double scale) {
-  return scale * random_standard_gamma_zig(bitgen_state, shape);
+  return scale * random_standard_gamma(bitgen_state, shape);
 }
 
-float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale) {
-  return scale * random_standard_gamma_zig_f(bitgen_state, shape);
+float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale) {
+  return scale * random_standard_gamma_f(bitgen_state, shape);
 }
 
 double random_beta(bitgen_t *bitgen_state, double a, double b) {
@@ -562,14 +435,14 @@ double random_beta(bitgen_t *bitgen_state, double a, double b) {
       }
     }
   } else {
-    Ga = random_standard_gamma_zig(bitgen_state, a);
-    Gb = random_standard_gamma_zig(bitgen_state, b);
+    Ga = random_standard_gamma(bitgen_state, a);
+    Gb = random_standard_gamma(bitgen_state, b);
     return Ga / (Ga + Gb);
   }
 }
 
 double random_chisquare(bitgen_t *bitgen_state, double df) {
-  return 2.0 * random_standard_gamma_zig(bitgen_state, df / 2.0);
+  return 2.0 * random_standard_gamma(bitgen_state, df / 2.0);
 }
 
 double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) {
@@ -578,22 +451,22 @@ double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) {
 }
 
 double random_standard_cauchy(bitgen_t *bitgen_state) {
-  return random_gauss_zig(bitgen_state) / random_gauss_zig(bitgen_state);
+  return random_standard_normal(bitgen_state) / random_standard_normal(bitgen_state);
 }
 
 double random_pareto(bitgen_t *bitgen_state, double a) {
-  return exp(standard_exponential_zig(bitgen_state) / a) - 1;
+  return exp(random_standard_exponential_zig(bitgen_state) / a) - 1;
 }
 
 double random_weibull(bitgen_t *bitgen_state, double a) {
   if (a == 0.0) {
     return 0.0;
   }
-  return pow(standard_exponential_zig(bitgen_state), 1. / a);
+  return pow(random_standard_exponential_zig(bitgen_state), 1. / a);
 }
 
 double random_power(bitgen_t *bitgen_state, double a) {
-  return pow(1 - exp(-standard_exponential_zig(bitgen_state)), 1. / a);
+  return pow(1 - exp(-random_standard_exponential_zig(bitgen_state)), 1. / a);
 }
 
 double random_laplace(bitgen_t *bitgen_state, double loc, double scale) {
@@ -634,7 +507,7 @@ double random_logistic(bitgen_t *bitgen_state, double loc, double scale) {
 }
 
 double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) {
-  return exp(random_normal_zig(bitgen_state, mean, sigma));
+  return exp(random_normal(bitgen_state, mean, sigma));
 }
 
 double random_rayleigh(bitgen_t *bitgen_state, double mode) {
@@ -644,8 +517,8 @@ double random_rayleigh(bitgen_t *bitgen_state, double mode) {
 double random_standard_t(bitgen_t *bitgen_state, double df) {
   double num, denom;
 
-  num = random_gauss_zig(bitgen_state);
-  denom = random_standard_gamma_zig(bitgen_state, df / 2);
+  num = random_standard_normal(bitgen_state);
+  denom = random_standard_gamma(bitgen_state, df / 2);
   return sqrt(df / 2) * num / sqrt(denom);
 }
 
@@ -699,7 +572,7 @@ static RAND_INT_TYPE random_poisson_ptrs(bitgen_t *bitgen_state, double lam) {
     /* log(V) == log(0.0) ok here */
     /* if U==0.0 so that us==0.0, log is ok since always returns */
     if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <=
-        (-lam + k * loglam - loggam(k + 1))) {
+        (-lam + k * loglam - random_loggam(k + 1))) {
       return k;
     }
   }
@@ -934,7 +807,7 @@ double random_noncentral_chisquare(bitgen_t *bitgen_state, double df,
   }
   if (1 < df) {
     const double Chi2 = random_chisquare(bitgen_state, df - 1);
-    const double n = random_gauss_zig(bitgen_state) + sqrt(nonc);
+    const double n = random_standard_normal(bitgen_state) + sqrt(nonc);
     return Chi2 + n * n;
   } else {
     const RAND_INT_TYPE i = random_poisson(bitgen_state, nonc / 2.0);
@@ -953,7 +826,7 @@ double random_wald(bitgen_t *bitgen_state, double mean, double scale) {
   double mu_2l;
 
   mu_2l = mean / (2 * scale);
-  Y = random_gauss_zig(bitgen_state);
+  Y = random_standard_normal(bitgen_state);
   Y = mean * Y * Y;
   X = mean + mu_2l * (Y - sqrt(4 * scale * Y + Y * Y));
   U = next_double(bitgen_state);
@@ -1092,8 +965,8 @@ RAND_INT_TYPE random_zipf(bitgen_t *bitgen_state, double a) {
   while (1) {
     double T, U, V, X;
 
-    U = 1.0 - random_double(bitgen_state);
-    V = random_double(bitgen_state);
+    U = 1.0 - next_double(bitgen_state);
+    V = next_double(bitgen_state);
     X = floor(pow(U, -1.0 / am1));
     /*
      * The real result may be above what can be represented in a signed
diff --git a/numpy/random/src/distributions/random_hypergeometric.c b/numpy/random/src/distributions/random_hypergeometric.c
index 94dc6380f..da5ea9c68 100644
--- a/numpy/random/src/distributions/random_hypergeometric.c
+++ b/numpy/random/src/distributions/random_hypergeometric.c
@@ -1,4 +1,4 @@
-#include "distributions.h"
+#include "include/distributions.h"
 #include "logfactorial.h"
 #include <stdint.h>
 
@@ -188,8 +188,8 @@ static int64_t hypergeometric_hrua(bitgen_t *bitgen_state,
     while (1) {
         double U, V, X, T;
         double gp;
-        U = random_double(bitgen_state);
-        V = random_double(bitgen_state);  // "U star" in Stadlober (1989)
+        U = next_double(bitgen_state);
+        V = next_double(bitgen_state);  // "U star" in Stadlober (1989)
         X = a + h*(V - 0.5) / U;
 
         // fast rejection:
diff --git a/numpy/random/src/legacy/legacy-distributions.c b/numpy/random/src/legacy/legacy-distributions.c
index 684b3d762..fd067fe8d 100644
--- a/numpy/random/src/legacy/legacy-distributions.c
+++ b/numpy/random/src/legacy/legacy-distributions.c
@@ -1,4 +1,4 @@
-#include "legacy-distributions.h"
+#include "include/legacy-distributions.h"
 
 
 static NPY_INLINE double legacy_double(aug_bitgen_t *aug_state) {
@@ -294,8 +294,8 @@ static RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state,
   d7 = sqrt((double)(popsize - m) * sample * d4 * d5 / (popsize - 1) + 0.5);
   d8 = D1 * d7 + D2;
   d9 = (RAND_INT_TYPE)floor((double)(m + 1) * (mingoodbad + 1) / (popsize + 2));
-  d10 = (loggam(d9 + 1) + loggam(mingoodbad - d9 + 1) + loggam(m - d9 + 1) +
-         loggam(maxgoodbad - m + d9 + 1));
+  d10 = (random_loggam(d9 + 1) + random_loggam(mingoodbad - d9 + 1) +
+         random_loggam(m - d9 + 1) + random_loggam(maxgoodbad - m + d9 + 1));
   d11 = MIN(MIN(m, mingoodbad) + 1.0, floor(d6 + 16 * d7));
   /* 16 for 16-decimal-digit precision in D1 and D2 */
 
@@ -309,8 +309,8 @@ static RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state,
       continue;
 
     Z = (RAND_INT_TYPE)floor(W);
-    T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) + loggam(m - Z + 1) +
-               loggam(maxgoodbad - m + Z + 1));
+    T = d10 - (random_loggam(Z + 1) + random_loggam(mingoodbad - Z + 1) +
+               random_loggam(m - Z + 1) + random_loggam(maxgoodbad - m + Z + 1));
 
     /* fast acceptance: */
     if ((X * (4.0 - X) - 3.0) <= T)
diff --git a/numpy/random/tests/test_direct.py b/numpy/random/tests/test_direct.py
index 0f57c4bd4..34d7bd278 100644
--- a/numpy/random/tests/test_direct.py
+++ b/numpy/random/tests/test_direct.py
@@ -10,7 +10,7 @@ from numpy.random import (
     Generator, MT19937, PCG64, Philox, RandomState, SeedSequence, SFC64,
     default_rng
 )
-from numpy.random.common import interface
+from numpy.random._common import interface
 
 try:
     import cffi  # noqa: F401
@@ -120,7 +120,7 @@ def gauss_from_uint(x, n, bits):
     return gauss[:n]
 
 def test_seedsequence():
-    from numpy.random.bit_generator import (ISeedSequence,
+    from numpy.random._bit_generator import (ISeedSequence,
                                             ISpawnableSeedSequence,
                                             SeedlessSeedSequence)
 
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index a0edc5c23..5131f1839 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -11,7 +11,8 @@ from numpy.testing import (
         suppress_warnings
         )
 
-from numpy.random import MT19937, PCG64, mtrand as random
+from numpy.random import MT19937, PCG64
+from numpy import random
 
 INT_FUNCS = {'binomial': (100.0, 0.6),
              'geometric': (.5,),
diff --git a/numpy/random/tests/test_randomstate_regression.py b/numpy/random/tests/test_randomstate_regression.py
index edf32ea97..bdc2214b6 100644
--- a/numpy/random/tests/test_randomstate_regression.py
+++ b/numpy/random/tests/test_randomstate_regression.py
@@ -8,7 +8,7 @@ from numpy.testing import (
 from numpy.compat import long
 import numpy as np
 
-from numpy.random import mtrand as random
+from numpy import random
 
 
 class TestRegression(object):
diff --git a/numpy/random/tests/test_seed_sequence.py b/numpy/random/tests/test_seed_sequence.py
index 8d6d604a2..fe23680ed 100644
--- a/numpy/random/tests/test_seed_sequence.py
+++ b/numpy/random/tests/test_seed_sequence.py
@@ -1,7 +1,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
-from numpy.random.bit_generator import SeedSequence
+from numpy.random import SeedSequence
 
 
 def test_reference_data():
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index e3621c0fd..c71d03432 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -298,15 +298,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "ma.timer_comparison",
     "matrixlib",
     "matrixlib.defmatrix",
-    "random.bit_generator",
-    "random.bounded_integers",
-    "random.common",
-    "random.generator",
-    "random.mt19937",
     "random.mtrand",
-    "random.pcg64",
-    "random.philox",
-    "random.sfc64",
     "testing.print_coercion_tables",
     "testing.utils",
 ]]
diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index 6baa55817..6094f0ee6 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -88,7 +88,7 @@ run_test()
 
   if [ -n "$RUN_FULL_TESTS" ]; then
     export PYTHONWARNINGS="ignore::DeprecationWarning:virtualenv"
-    $PYTHON ../runtests.py -n -v --durations 10 --mode=full $COVERAGE_FLAG
+    $PYTHON -b ../runtests.py -n -v --durations 10 --mode=full $COVERAGE_FLAG
   else
     # disable --durations temporarily, pytest currently aborts
     # when that is used with python3.6-dbg
diff --git a/tox.ini b/tox.ini
index a38a03c97..5a6d074aa 100644
--- a/tox.ini
+++ b/tox.ini
@@ -32,7 +32,7 @@ envlist =
 [testenv]
 deps= -Ur{toxinidir}/test_requirements.txt
 changedir={envdir}
-commands={envpython} {toxinidir}/runtests.py --mode=full {posargs:}
+commands={envpython} -b {toxinidir}/runtests.py --mode=full {posargs:}
 
 [testenv:py37-not-relaxed-strides]
 basepython=python3.7